Introduction¶

In this project, our goal was to predict stock prices using a machine learning approach. To achieve this, we designed and implemented a model based on a set of carefully chosen features. These features included technical indicators such as Relative Strength Index (RSI), Money Flow Index (MFI), Exponential Moving Averages (EMA), Simple Moving Average (SMA),Moving Average Convergence Divergence (MACD) as well as historical price data encompassing the previous 1 day, 3 days, 5 days, and 1, 2, 3, 4 weeks. Additionally, rolling average values for high, low, open, close, adjusted close, and volume were incorporated.

Import Libraries¶

In [1]:
import os
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor 
from catboost import CatBoostRegressor


from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
pd.set_option('display.max_columns', None)


# Chart drawing
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Show charts when running kernel
#init_notebook_mode(connected=True)

# Change default background color for all visualizations
layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(250,250,250,0.8)')
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")
warnings.filterwarnings("ignore")

Functions¶

In [2]:
def evaluate_regression_model(y_true, y_pred):
    """
    Calculate and print evaluation metrics for a regression model.

    Parameters:
    - y_true: Actual values.
    - y_pred: Predicted values.

    Returns:
    - Dictionary containing the evaluation metrics.
    """
    # Calculate evaluation metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Print the results
    print(f'Mean Squared Error (MSE): {np.round(mse,3)}')
    print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
    print(f'Mean Absolute Error (MAE): {np.round(mae,3)}')
    print(f'R-squared (R2): {np.round(r2,3)}')

    # Return results as a dictionary
    results = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }

    return results
In [3]:
def evaluate_regression_model2(y_true, y_pred):
    """
    Calculate and print evaluation metrics for a regression model.

    Parameters:
    - y_true: Actual values.
    - y_pred: Predicted values.

    Returns:
    - Dictionary containing the evaluation metrics.
    """
    # Calculate evaluation metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Return results as a dictionary
    results = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }

    return results
In [4]:
# Returns RSI values
def rsi(df, periods = 14):
    
    """
    Calculate the Relative Strength Index (RSI) for a given DataFrame.

    Parameters:
    - df (DataFrame): Pandas DataFrame with a 'close' column.
    - periods (int): Number of periods to consider for RSI calculation. Default is 14.

    Returns:
    - Series: A pandas Series containing the RSI values.
    """
        
        
    close = df['close']
    close_delta = close.diff()

    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    
    ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()

    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi
In [5]:
def gain(x):
    return ((x > 0) * x).sum()


def loss(x):
    return ((x < 0) * x).sum()


def mfi(df, n=14):
    """
    Calculate the Money Flow Index (MFI) for a given DataFrame.

    Parameters:
    - df (DataFrame): Pandas DataFrame with 'high', 'low', 'close', and 'volume' columns.
    - n (int): Number of periods to use for the MFI calculation. Default is 14.

    Returns:
    - numpy.ndarray: An array containing the MFI values.
    """
    
    high = df['high']
    low = df['low']
    close = df['close']
    volume = df['volume']
    
    typical_price = (high + low + close) / 3
    money_flow = typical_price * volume
    mf_sign = np.where(typical_price > typical_price.shift(1), 1, -1)
    signed_mf = money_flow * mf_sign

    # Calculate gain and loss using vectorized operations
    positive_mf = np.where(signed_mf > 0, signed_mf, 0)
    negative_mf = np.where(signed_mf < 0, -signed_mf, 0)

    mf_avg_gain = pd.Series(positive_mf).rolling(n, min_periods=1).sum()
    mf_avg_loss = pd.Series(negative_mf).rolling(n, min_periods=1).sum()

    return (100 - 100 / (1 + mf_avg_gain / mf_avg_loss)).to_numpy()
In [6]:
def plot_regression_accuracy(y_true, y_pred):
    """
    Create various plots to evaluate the accuracy of a linear regression model.

    Parameters:
    - y_true: Actual values.
    - y_pred: Predicted values.
    """
    # Scatter Plot
    plt.scatter(y_true, y_pred)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Scatter Plot of Actual vs Predicted Values')
    plt.show()

    # Residual Plot
    residuals = y_true - y_pred
    plt.scatter(y_pred, residuals)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.show()

    # Distribution of Residuals
    sns.histplot(residuals, kde=True)
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title('Distribution of Residuals')
    plt.show()

    # Predicted vs Actual Line
    plt.plot(y_true, y_true, linestyle='--', color='r', label='Perfect Fit')
    plt.scatter(y_true, y_pred)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Predicted vs Actual Values with Perfect Fit Line')
    plt.legend()
    plt.show()
In [7]:
def plot_predictions(df,prediction):
    
    """
    Create a Plotly graph to compare actual values with predictions.

    Parameters:
    - df (DataFrame): A pandas DataFrame containing 'date' and 'close_1d_next' columns.
    - prediction (array-like): Predicted values corresponding to the test set.
    """
        
    plot_test_df= df[df.date.dt.year>=2020]
    plot_test_df['prediction'] = prediction

    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(go.Scatter(x=df.date, y=df.close_1d_next,
                             name='Truth',
                             marker_color='LightSkyBlue'), row=1, col=1)

    fig.add_trace(go.Scatter(x=plot_test_df.date,
                             y=plot_test_df.prediction,
                             name='Prediction',
                             marker_color='MediumPurple'), row=1, col=1)
    
    # Add title and Y-axis title for the first subplot
    fig.update_layout(title_text='Train Data and Test Data', title_x=0.5, title_y=0.9)
    fig.update_yaxes(title_text='Prediction', row=1, col=1)

    fig.add_trace(go.Scatter(x=plot_test_df.date,
                             y=y_test,
                             name='Truth',
                             marker_color='LightSkyBlue',
                             showlegend=False), row=2, col=1)

    fig.add_trace(go.Scatter(x=plot_test_df.date,
                             y=prediction,
                             name='Prediction',
                             marker_color='MediumPurple',
                             showlegend=False), row=2, col=1)
    
    fig.update_yaxes(title_text='Prediction', row=2, col=1)

    fig.show()
In [8]:
def plot_feature_importance(model,X_train,top_features):

    """
    Plot the feature importance from a linear regression model and return a sorted DataFrame of feature importances.

    Parameters:
    - model: A trained linear regression model with a coef_ attribute.
    - X_train (DataFrame): The DataFrame used to train the model, for feature names.
    - num_top_features (int): Number of top features to display.

    Returns:
    - DataFrame: Sorted DataFrame with features and their importance.
    """
    
    # Get feature importance scores (coefficients)
    feature_importance = model.coef_

    # Create a DataFrame to store feature names and importance scores
    feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': np.abs(feature_importance)})

    # Sort features by importance
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

    top_features = top_features
    # Plot feature importance
    plt.figure(figsize=(20, 6))
    plt.barh(range(len(feature_importance_df[:top_features])), feature_importance_df[:top_features]['Importance'], align="center")
    plt.yticks(range(len(feature_importance_df[:top_features])), labels=feature_importance_df[:top_features]['Feature'])
    plt.ylabel("Features")
    plt.xlabel("Coefficient Magnitude")
    plt.title(f"Top {top_features} Feature Importance Values")
    plt.show()
    
    return feature_importance_df

Read Data¶

In [9]:
out_loc  = '/Users/isapocan/Desktop/LSU/data/'
# Define the file path for the parquet file
parquet_file_path = out_loc + "stock_1d.parquet"
In [10]:
try:
    # Read the Parquet file into a DataFrame
    df = pd.read_parquet(parquet_file_path)

    # Convert column names to lowercase for consistency
    df.columns = df.columns.str.lower()

    # Display the first few rows of the DataFrame
    display(df.head())

except Exception as e:
    print(f"An error occurred while reading the file: {e}")
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded
0 2013-01-02 94.190002 94.790001 93.959999 94.779999 67.895119 3206700.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902
1 2013-01-03 94.339996 94.930000 94.129997 94.669998 67.816322 2704600.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902
2 2013-01-04 94.790001 95.480003 94.540001 95.370003 68.317757 2704900.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902
3 2013-01-07 95.019997 95.730003 94.760002 95.489998 68.403717 2745800.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902
4 2013-01-08 95.169998 95.750000 95.099998 95.500000 68.410889 2655500.0 MMM 3M Industrials Industrial Conglomerates Saint Paul, Minnesota 1957-03-04 66740 1902

Select Stock¶

In [11]:
# Filter the DataFrame to include only rows where 'symbol' is 'MDLZ'
df = df[df['symbol']=='MDLZ']

# Display the first few rows and the shape of the filtered DataFrame
display(df.head())
display(df.shape)
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded
852843 2013-01-02 25.840000 26.690001 25.780001 26.670000 21.445908 17862400.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
852844 2013-01-03 26.700001 26.770000 26.490000 26.639999 21.421791 9075500.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
852845 2013-01-04 26.700001 26.830000 26.549999 26.740000 21.502203 7696000.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
852846 2013-01-07 26.620001 26.740000 26.549999 26.660000 21.437866 7576200.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
852847 2013-01-08 26.520000 26.920000 26.459999 26.680000 21.453959 14360800.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012
(2733, 15)

1. Feature engineering¶

1.a. Financial indicators¶

  • Relative Strength Index (RSI):

Description: RSI helps you understand if a stock is likely to be overbought (prices too high) or oversold (prices too low). It looks at recent price changes to make this determination.

  • Money Flow Index (MFI):

Description: MFI considers both price and trading volume to identify if a stock is overbought or oversold. It helps gauge the strength of buying and selling pressure.

  • Exponential Moving Average (EMA):

Description: EMA smoothens out price data, giving more weight to recent prices. It reacts faster to price changes compared to a Simple Moving Average (SMA), making it useful for trend analysis.

  • Simple Moving Average (SMA):

Description: SMA is a basic average of stock prices over a specific period. It provides a smoothed representation of the overall price trend, helping to identify general market direction.

  • Moving Average Convergence Divergence (MACD):

Description: MACD is a trend-following momentum indicator that shows the relationship between two moving averages of a security's price. It helps identify potential trend reversals or momentum shifts.

  • MACD Signal Line (MACD_signal):

Description: The MACD signal line is a nine-day EMA of the MACD. It is used to generate trading signals. When the MACD crosses above the signal line, it might be a signal to buy, and when it crosses below, it might be a signal to sell.

  • Lag and Rolling Average values of high, low, open, close, adjusted close, and volume.
In [12]:
def add_moving_averages(df, column_name):
    """
    Adds various moving averages to the DataFrame.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The column name to calculate moving averages for.
    """
    # Exponential Moving Average (EMA)
    df['ema_9'] = df[column_name].ewm(span=9).mean().shift()

    # Simple Moving Averages (SMA) with different periods
    for period in [5, 10, 15, 30]:
        df[f'sma_{period}'] = df[column_name].rolling(window=period).mean().shift()

# Add moving averages for the 'close' column
add_moving_averages(df, 'close')

df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2733 entries, 852843 to 855575
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   2733 non-null   datetime64[ns]
 1   open                   2733 non-null   float64       
 2   high                   2733 non-null   float64       
 3   low                    2733 non-null   float64       
 4   close                  2733 non-null   float64       
 5   adj close              2733 non-null   float64       
 6   volume                 2733 non-null   float64       
 7   symbol                 2733 non-null   object        
 8   security               2733 non-null   object        
 9   gics sector            2733 non-null   object        
 10  gics sub-industry      2733 non-null   object        
 11  headquarters location  2733 non-null   object        
 12  date added             2733 non-null   object        
 13  cik                    2733 non-null   int64         
 14  founded                2733 non-null   object        
 15  ema_9                  2732 non-null   float64       
 16  sma_5                  2728 non-null   float64       
 17  sma_10                 2723 non-null   float64       
 18  sma_15                 2718 non-null   float64       
 19  sma_30                 2703 non-null   float64       
dtypes: datetime64[ns](1), float64(11), int64(1), object(7)
memory usage: 448.4+ KB
In [13]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2733 entries, 852843 to 855575
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   2733 non-null   datetime64[ns]
 1   open                   2733 non-null   float64       
 2   high                   2733 non-null   float64       
 3   low                    2733 non-null   float64       
 4   close                  2733 non-null   float64       
 5   adj close              2733 non-null   float64       
 6   volume                 2733 non-null   float64       
 7   symbol                 2733 non-null   object        
 8   security               2733 non-null   object        
 9   gics sector            2733 non-null   object        
 10  gics sub-industry      2733 non-null   object        
 11  headquarters location  2733 non-null   object        
 12  date added             2733 non-null   object        
 13  cik                    2733 non-null   int64         
 14  founded                2733 non-null   object        
 15  ema_9                  2732 non-null   float64       
 16  sma_5                  2728 non-null   float64       
 17  sma_10                 2723 non-null   float64       
 18  sma_15                 2718 non-null   float64       
 19  sma_30                 2703 non-null   float64       
dtypes: datetime64[ns](1), float64(11), int64(1), object(7)
memory usage: 448.4+ KB
In [14]:
# Add a Relative Strength Index (RSI) column to the DataFrame
try:
    df['rsi'] = rsi(df) # Uncomment and adjust fillna(0) if appropriate for handling missing values
except Exception as e:
    print(f"Error calculating RSI: {e}")

# Add a Money Flow Index (MFI) column to the DataFrame
try:
    df['mfi'] = mfi(df, 14) # The second argument is the period, here assumed to be 14
except Exception as e:
    print(f"Error calculating MFI: {e}")
In [15]:
df[['date','close','ema_9','sma_5','sma_10','sma_15','sma_30','rsi','mfi']]
Out[15]:
date close ema_9 sma_5 sma_10 sma_15 sma_30 rsi mfi
852843 2013-01-02 26.670000 NaN NaN NaN NaN NaN NaN 0.000000
852844 2013-01-03 26.639999 26.670000 NaN NaN NaN NaN NaN 33.904295
852845 2013-01-04 26.740000 26.653333 NaN NaN NaN NaN NaN 48.695375
852846 2013-01-07 26.660000 26.688852 NaN NaN NaN NaN NaN 39.919745
852847 2013-01-08 26.680000 26.679078 NaN NaN NaN NaN NaN 55.233142
... ... ... ... ... ... ... ... ... ...
855571 2023-11-02 67.970001 65.583879 65.934001 65.321001 64.406001 65.993000 60.257764 89.207420
855572 2023-11-03 68.820000 66.061103 66.398001 65.697001 64.868001 65.901667 63.726091 89.458580
855573 2023-11-06 68.239998 66.612883 67.160001 66.169001 65.354001 65.848000 59.885606 83.710782
855574 2023-11-07 68.489998 66.938306 67.612000 66.594001 65.728667 65.799000 60.977252 75.937617
855575 2023-11-08 69.019997 67.248644 68.067999 66.888000 66.058667 65.729667 63.259914 75.566164

2733 rows × 9 columns

In [16]:
# Calculate and display the correlation between 'rsi' and 'mfi'
if {'rsi', 'mfi'}.issubset(df.columns):
    correlation = df[['rsi', 'mfi']].corr()
    print(correlation)
else:
    print("DataFrame does not contain 'rsi' and/or 'mfi' columns.")
          rsi       mfi
rsi  1.000000  0.698958
mfi  0.698958  1.000000
In [17]:
# calculating the Moving Average Convergence Divergence (MACD) and its signal line

# Ensure the 'close' column exists in the DataFrame
if 'close' in df.columns:
    # Calculate the 12-period EMA of the closing prices
    df['macd'] = df['close'].ewm(span=12, min_periods=12).mean() - df['close'].ewm(span=26, min_periods=26).mean()

    # Calculate the 9-period EMA of the MACD values (signal line)
    df['macd_signal'] = df['macd'].ewm(span=9, min_periods=9).mean()
else:
    print("DataFrame does not contain 'close' column.")
In [18]:
# Check if the required columns exist in the DataFrame
if {'macd', 'macd_signal'}.issubset(df.columns):
    # Select rows where 'macd' and 'macd_signal' columns do not have missing values
    filtered_df = df[(~df['macd'].isna()) & (~df['macd_signal'].isna())]
    
    # Display the first few rows of these columns
    print(filtered_df[['macd', 'macd_signal']].head())
else:
    print("DataFrame does not contain 'macd' and/or 'macd_signal' columns.")
            macd  macd_signal
852876 -0.147786    -0.050945
852877 -0.175230    -0.078792
852878 -0.198438    -0.104970
852879 -0.235462    -0.132994
852880 -0.226841    -0.152855

1.b. Shift Close price in order to predict next day¶

In [19]:
# Check if the required columns exist in the DataFrame
if {'date', 'close'}.issubset(df.columns):
    # Create a new column representing the next day's closing price
    df['close_1d_next'] = df['close'].shift(-1)

    # Display the first few rows including 'date', 'close', and 'close_1d_next'
    print(df[['date', 'close', 'close_1d_next']].head())
else:
    print("DataFrame does not contain 'date' and/or 'close' columns.")
             date      close  close_1d_next
852843 2013-01-02  26.670000      26.639999
852844 2013-01-03  26.639999      26.740000
852845 2013-01-04  26.740000      26.660000
852846 2013-01-07  26.660000      26.680000
852847 2013-01-08  26.680000      27.049999

1.c. Add lag features¶

In [20]:
def add_lagged_features(df, column_name, lags):
    """
    Adds lagged features for a specified column in the DataFrame.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The column name to create lagged features for.
    - lags (list of int): The list of lag periods.
    """
    for lag in lags:
        df[f'{column_name}_{lag}d_ago'] = df[column_name].shift(lag)

def add_rolling_avg_features(df, column_name, windows):
    """
    Adds rolling average features for a specified column in the DataFrame.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The column name to create rolling average features for.
    - windows (list of int): The list of rolling window sizes.
    """
    for window in windows:
        df[f'{column_name}_{window}d_avg'] = df[column_name].rolling(window=window).mean()

# Define lag periods and rolling window sizes
lag_periods = [1, 3, 5, 7, 14, 21, 28]
rolling_windows = [3, 5, 7, 10, 15, 30]

# Columns to create features for
columns = ['close', 'adj close', 'open', 'high', 'low', 'volume']

# Add lagged and rolling average features for each column
for column in columns:
    add_lagged_features(df, column, lag_periods)
    add_rolling_avg_features(df, column, rolling_windows)

# View the DataFrame
df.head()
Out[20]:
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded ema_9 sma_5 sma_10 sma_15 sma_30 rsi mfi macd macd_signal close_1d_next close_1d_ago close_3d_ago close_5d_ago close_7d_ago close_14d_ago close_21d_ago close_28d_ago close_3d_avg close_5d_avg close_7d_avg close_10d_avg close_15d_avg close_30d_avg adj close_1d_ago adj close_3d_ago adj close_5d_ago adj close_7d_ago adj close_14d_ago adj close_21d_ago adj close_28d_ago adj close_3d_avg adj close_5d_avg adj close_7d_avg adj close_10d_avg adj close_15d_avg adj close_30d_avg open_1d_ago open_3d_ago open_5d_ago open_7d_ago open_14d_ago open_21d_ago open_28d_ago open_3d_avg open_5d_avg open_7d_avg open_10d_avg open_15d_avg open_30d_avg high_1d_ago high_3d_ago high_5d_ago high_7d_ago high_14d_ago high_21d_ago high_28d_ago high_3d_avg high_5d_avg high_7d_avg high_10d_avg high_15d_avg high_30d_avg low_1d_ago low_3d_ago low_5d_ago low_7d_ago low_14d_ago low_21d_ago low_28d_ago low_3d_avg low_5d_avg low_7d_avg low_10d_avg low_15d_avg low_30d_avg volume_1d_ago volume_3d_ago volume_5d_ago volume_7d_ago volume_14d_ago volume_21d_ago volume_28d_ago volume_3d_avg volume_5d_avg volume_7d_avg volume_10d_avg volume_15d_avg volume_30d_avg
852843 2013-01-02 25.840000 26.690001 25.780001 26.670000 21.445908 17862400.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 NaN NaN NaN NaN NaN NaN 0.000000 NaN NaN 26.639999 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
852844 2013-01-03 26.700001 26.770000 26.490000 26.639999 21.421791 9075500.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.670000 NaN NaN NaN NaN NaN 33.904295 NaN NaN 26.740000 26.670000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 21.445908 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 25.840000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 26.690001 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 25.780001 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 17862400.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
852845 2013-01-04 26.700001 26.830000 26.549999 26.740000 21.502203 7696000.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.653333 NaN NaN NaN NaN NaN 48.695375 NaN NaN 26.660000 26.639999 NaN NaN NaN NaN NaN NaN 26.683333 NaN NaN NaN NaN NaN 21.421791 NaN NaN NaN NaN NaN NaN 21.456634 NaN NaN NaN NaN NaN 26.700001 NaN NaN NaN NaN NaN NaN 26.413334 NaN NaN NaN NaN NaN 26.770000 NaN NaN NaN NaN NaN NaN 26.763334 NaN NaN NaN NaN NaN 26.490000 NaN NaN NaN NaN NaN NaN 26.273333 NaN NaN NaN NaN NaN 9075500.0 NaN NaN NaN NaN NaN NaN 1.154463e+07 NaN NaN NaN NaN NaN
852846 2013-01-07 26.620001 26.740000 26.549999 26.660000 21.437866 7576200.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.688852 NaN NaN NaN NaN NaN 39.919745 NaN NaN 26.680000 26.740000 26.670000 NaN NaN NaN NaN NaN 26.680000 NaN NaN NaN NaN NaN 21.502203 21.445908 NaN NaN NaN NaN NaN 21.453953 NaN NaN NaN NaN NaN 26.700001 25.840000 NaN NaN NaN NaN NaN 26.673334 NaN NaN NaN NaN NaN 26.830000 26.690001 NaN NaN NaN NaN NaN 26.780000 NaN NaN NaN NaN NaN 26.549999 25.780001 NaN NaN NaN NaN NaN 26.529999 NaN NaN NaN NaN NaN 7696000.0 17862400.0 NaN NaN NaN NaN NaN 8.115900e+06 NaN NaN NaN NaN NaN
852847 2013-01-08 26.520000 26.920000 26.459999 26.680000 21.453959 14360800.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.679078 NaN NaN NaN NaN NaN 55.233142 NaN NaN 27.049999 26.660000 26.639999 NaN NaN NaN NaN NaN 26.693333 26.678 NaN NaN NaN NaN 21.437866 21.421791 NaN NaN NaN NaN NaN 21.464676 21.452345 NaN NaN NaN NaN 26.620001 26.700001 NaN NaN NaN NaN NaN 26.613334 26.476001 NaN NaN NaN NaN 26.740000 26.770000 NaN NaN NaN NaN NaN 26.830000 26.79 NaN NaN NaN NaN 26.549999 26.490000 NaN NaN NaN NaN NaN 26.519999 26.366 NaN NaN NaN NaN 7576200.0 9075500.0 NaN NaN NaN NaN NaN 9.877667e+06 11314180.0 NaN NaN NaN NaN

1.d. Remove missing records after feature engineering¶

In [21]:
# Check if the DataFrame contains any missing values
if df.isna().any().any():
    # Remove records with missing values and reset the index
    df = df.dropna().reset_index(drop=True)
    print("Missing records removed. DataFrame is now cleaned.")
else:
    df = df.copy()
    print("No missing records found. DataFrame remains unchanged.")

# Display the first few rows of the cleaned DataFrame
df.head()
Missing records removed. DataFrame is now cleaned.
Out[21]:
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded ema_9 sma_5 sma_10 sma_15 sma_30 rsi mfi macd macd_signal close_1d_next close_1d_ago close_3d_ago close_5d_ago close_7d_ago close_14d_ago close_21d_ago close_28d_ago close_3d_avg close_5d_avg close_7d_avg close_10d_avg close_15d_avg close_30d_avg adj close_1d_ago adj close_3d_ago adj close_5d_ago adj close_7d_ago adj close_14d_ago adj close_21d_ago adj close_28d_ago adj close_3d_avg adj close_5d_avg adj close_7d_avg adj close_10d_avg adj close_15d_avg adj close_30d_avg open_1d_ago open_3d_ago open_5d_ago open_7d_ago open_14d_ago open_21d_ago open_28d_ago open_3d_avg open_5d_avg open_7d_avg open_10d_avg open_15d_avg open_30d_avg high_1d_ago high_3d_ago high_5d_ago high_7d_ago high_14d_ago high_21d_ago high_28d_ago high_3d_avg high_5d_avg high_7d_avg high_10d_avg high_15d_avg high_30d_avg low_1d_ago low_3d_ago low_5d_ago low_7d_ago low_14d_ago low_21d_ago low_28d_ago low_3d_avg low_5d_avg low_7d_avg low_10d_avg low_15d_avg low_30d_avg volume_1d_ago volume_3d_ago volume_5d_ago volume_7d_ago volume_14d_ago volume_21d_ago volume_28d_ago volume_3d_avg volume_5d_avg volume_7d_avg volume_10d_avg volume_15d_avg volume_30d_avg
0 2013-02-20 27.070000 27.150000 26.950001 27.030001 21.735399 17057200.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 27.307926 27.136 27.518 27.642000 27.589000 41.633625 53.176274 -0.147786 -0.050945 26.820000 26.959999 26.570000 27.680000 27.76 27.730000 28.080000 27.049999 26.903333 27.006 27.208571 27.426 27.588667 27.601333 21.679117 21.365499 22.258080 22.322405 22.298285 22.579723 21.751484 21.633545 21.716101 21.878994 22.053831 22.184635 22.194819 26.750000 26.690001 27.700001 27.799999 27.830000 27.969999 26.790001 26.886667 27.018 27.217143 27.386 27.553333 27.536667 27.190001 27.020000 27.830000 28.100000 27.980000 28.100000 27.080000 27.136667 27.248000 27.410000 27.618 27.779333 27.754667 26.750000 26.450001 27.270000 27.750000 27.67 27.820000 26.68 26.766667 26.842 27.015714 27.224 27.411333 27.382333 18297500.0 37728900.0 14931000.0 11159200.0 5800400.0 15906900.0 11671400.0 1.904973e+07 21756140.0 1.907480e+07 17005580.0 1.419575e+07 1.352419e+07
1 2013-02-21 26.990000 27.049999 26.639999 26.820000 21.566534 16936600.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 27.252312 27.006 27.426 27.588667 27.601333 38.257648 47.431888 -0.175230 -0.078792 26.770000 27.030001 26.719999 27.750000 27.75 27.790001 27.559999 27.309999 26.936666 26.820 27.075714 27.308 27.528000 27.606000 21.735399 21.486118 22.314371 22.314371 22.346525 22.161583 21.960548 21.660350 21.566534 21.772160 21.958945 22.135851 22.198572 27.070000 26.840000 27.740000 27.730000 27.650000 27.730000 27.129999 26.936666 26.868 27.111429 27.295 27.497333 27.552333 27.150000 27.070000 27.809999 27.799999 27.950001 28.040001 27.340000 27.130000 27.096000 27.302857 27.510 27.717333 27.759000 26.950001 26.600000 27.459999 27.629999 27.65 27.299999 27.09 26.780000 26.678 26.874286 27.108 27.342667 27.388333 17057200.0 21794500.0 13902600.0 9811900.0 7541300.0 18213200.0 16348500.0 1.743043e+07 22362940.0 2.009261e+07 17608410.0 1.493817e+07 1.361005e+07
2 2013-02-22 26.889999 27.129999 26.730000 26.770000 21.526327 16664800.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 27.165815 26.820 27.308 27.528000 27.606000 37.478423 48.958416 -0.198438 -0.104970 26.490000 26.820000 26.959999 26.570000 27.68 28.219999 27.790001 27.420000 26.873334 26.860 26.945714 27.181 27.460000 27.596667 21.566534 21.679117 21.365499 22.258080 22.692308 22.346525 22.049007 21.609420 21.598699 21.667624 21.856822 22.081171 22.191067 26.990000 26.750000 26.690001 27.700001 28.000000 27.500000 27.350000 26.983333 26.908 26.995714 27.220 27.446667 27.555667 27.049999 27.190001 27.020000 27.830000 28.320000 27.889999 27.540001 27.109999 27.118000 27.202857 27.415 27.662666 27.760667 26.639999 26.750000 26.450001 27.270000 27.93 27.350000 27.25 26.773333 26.734 26.797143 27.023 27.281333 27.390000 16936600.0 18297500.0 37728900.0 14931000.0 9623100.0 15212300.0 10162600.0 1.688620e+07 18150120.0 2.034030e+07 17828420.0 1.554640e+07 1.377650e+07
3 2013-02-25 26.790001 27.080000 26.480000 26.490000 21.301172 15527100.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 27.086626 26.860 27.181 27.460000 27.596667 33.378362 47.675126 -0.235462 -0.132994 26.950001 26.770000 27.030001 26.719999 27.75 27.879999 27.830000 27.480000 26.693333 26.814 26.765714 27.054 27.344667 27.569333 21.526327 21.735399 21.486118 22.314371 22.418896 22.378695 22.097254 21.464678 21.561710 21.522881 21.754699 21.988429 22.169087 26.889999 27.070000 26.840000 27.740000 28.010000 27.930000 27.459999 26.890000 26.898 26.860000 27.119 27.366000 27.544333 27.129999 27.150000 27.070000 27.809999 28.150000 28.030001 27.520000 27.086666 27.120000 27.098571 27.313 27.580000 27.752000 26.730000 26.950001 26.600000 27.459999 27.83 27.639999 27.17 26.616666 26.710 26.657143 26.896 27.184667 27.369667 16664800.0 17057200.0 21794500.0 13902600.0 8954300.0 14444500.0 8688200.0 1.637617e+07 16896640.0 2.057237e+07 18265210.0 1.594000e+07 1.374912e+07
4 2013-02-26 26.530001 26.980000 26.510000 26.950001 21.671074 13702900.0 MDLZ Mondelez International Consumer Staples Packaged Foods & Meats Chicago, Illinois 2012-10-02 1103982 2012 26.967270 26.814 27.054 27.344667 27.569333 44.181951 48.178912 -0.226841 -0.152855 27.570000 26.490000 26.820000 26.959999 26.57 27.950001 27.780001 27.709999 26.736667 26.812 26.820000 26.974 27.282667 27.553667 21.301172 21.566534 21.679117 21.365499 22.475189 22.338484 22.282200 21.499524 21.560101 21.566535 21.690369 21.938574 22.156490 26.790001 26.990000 26.750000 26.690001 27.950001 27.830000 27.580000 26.736667 26.854 26.837143 26.999 27.267333 27.517000 27.080000 27.049999 27.190001 27.020000 28.110001 27.889999 27.740000 27.063333 27.077999 27.092857 27.231 27.502000 27.733333 26.480000 26.639999 26.750000 26.450001 27.85 27.690001 27.34 26.573333 26.662 26.665714 26.784 27.096667 27.345000 15527100.0 16936600.0 18297500.0 37728900.0 10961400.0 12066800.0 9863200.0 1.529827e+07 15977720.0 1.714009e+07 18654310.0 1.625657e+07 1.386713e+07

1.e. Split Data into Train ( before Covid ) and Test ( after Covid )¶

In [22]:
# # Calculate the index for the 70-30 split
# split_index = int(0.7 * len(df))

# # Split the DataFrame into training and testing sets
# train_df = df.iloc[:split_index]
# test_df = df.iloc[split_index:]

# Split the DataFrame into training and testing sets
train_df = df[df.date.dt.year<2020]
test_df = df[df.date.dt.year>=2020]



print(f"Train days: {len(train_df)}, Test days: {len(test_df)}")

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.date, y=train_df.close_1d_next, name='Training'))
fig.add_trace(go.Scatter(x=test_df.date,  y=test_df.close_1d_next,  name='Test'))
fig.show()
Train days: 1729, Test days: 970
In [23]:
drop_cols1 = ['date','open','high','low','close','adj close','volume','symbol','security',
 'gics sector','gics sub-industry','headquarters location','date added','cik','founded']

train_df = train_df.drop(drop_cols1, 1)

test_df  = test_df.drop(drop_cols1, 1)

# target column is next day's close price
y_train = train_df['close_1d_next'].copy()
X_train = train_df.drop(['close_1d_next'], 1)

# target column is next day's close price
y_test  = test_df['close_1d_next'].copy()
X_test  = test_df.drop(['close_1d_next'], 1)
In [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [25]:
X_train.shape, X_train_scaled.shape, X_test.shape, X_test_scaled.shape, 
Out[25]:
((1729, 87), (1729, 87), (970, 87), (970, 87))

2. Train multiple regression models¶

In [26]:
def train_and_evaluate_models(X_train_scaled,y_train,X_test_scaled,y_test):
    """
    Train and evaluate multiple regression models on a given dataframe.

    Parameters:
    - dataframe: Pandas DataFrame containing the dataset.
    - target_column: Name of the target column (dependent variable).
    - features_columns: List of column names used as features (independent variables).

    Returns:
    - A DataFrame containing evaluation metrics for each model.
    """

    # Split the data into features (X) and target variable (y)
#     X = dataframe[features_columns]
#     y = dataframe[target_column]

    # Split the data into training and testing sets (70-30 split)
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    
#     scaler = StandardScaler()
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)


    # Initialize the models
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        'Elastic Net': ElasticNet(),
        'SVR': SVR(),
        'K-Neighbors Regressor': KNeighborsRegressor(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'CatBoost': CatBoostRegressor()
    }

    # Initialize a DataFrame to store the evaluation metrics
    metrics_df = pd.DataFrame(columns=['Model', 'Mean Squared Error', 'Mean Absolute Error', 'R2 Score'])

    # Train and evaluate each model
    for model_name, model in models.items():
        start_time = time.time() 
       
    # Train the model
        model.fit(X_train_scaled, y_train)
        
        end_time = time.time()  # Record the end time

        training_time = end_time - start_time

        # Make predictions
        y_pred = model.predict(X_test_scaled)

        # Evaluate the model
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store the metrics in the DataFrame
        metrics_df = metrics_df.append({
            'Model': model_name,
            'Mean Squared Error': mse,
            'Mean Absolute Error': mae,
            'R2 Score': r2,
            'Training Time (s)': training_time
        }, ignore_index=True)
        metrics_df = metrics_df.sort_values(by=['R2 Score'],ascending=False)
    return metrics_df
In [27]:
df_compare = train_and_evaluate_models(X_train,y_train,X_test,y_test)
Learning rate set to 0.044643
0:	learn: 6.0603979	total: 60.8ms	remaining: 1m
1:	learn: 5.8169612	total: 64.5ms	remaining: 32.2s
2:	learn: 5.5755845	total: 68.1ms	remaining: 22.6s
3:	learn: 5.3462380	total: 71.7ms	remaining: 17.9s
4:	learn: 5.1261132	total: 75.4ms	remaining: 15s
5:	learn: 4.9208602	total: 79.1ms	remaining: 13.1s
6:	learn: 4.7250714	total: 83.1ms	remaining: 11.8s
7:	learn: 4.5456470	total: 86.6ms	remaining: 10.7s
8:	learn: 4.3606091	total: 90.2ms	remaining: 9.94s
9:	learn: 4.1830096	total: 93.9ms	remaining: 9.29s
10:	learn: 4.0166823	total: 97.7ms	remaining: 8.79s
11:	learn: 3.8657076	total: 102ms	remaining: 8.36s
12:	learn: 3.7140111	total: 105ms	remaining: 8.01s
13:	learn: 3.5654097	total: 109ms	remaining: 7.7s
14:	learn: 3.4264460	total: 113ms	remaining: 7.42s
15:	learn: 3.2905647	total: 117ms	remaining: 7.18s
16:	learn: 3.1619168	total: 121ms	remaining: 6.97s
17:	learn: 3.0386276	total: 124ms	remaining: 6.77s
18:	learn: 2.9250019	total: 128ms	remaining: 6.6s
19:	learn: 2.8155633	total: 132ms	remaining: 6.46s
20:	learn: 2.7067078	total: 136ms	remaining: 6.32s
21:	learn: 2.6104041	total: 139ms	remaining: 6.19s
22:	learn: 2.5171443	total: 143ms	remaining: 6.07s
23:	learn: 2.4234694	total: 147ms	remaining: 5.96s
24:	learn: 2.3336920	total: 150ms	remaining: 5.86s
25:	learn: 2.2501270	total: 154ms	remaining: 5.77s
26:	learn: 2.1685983	total: 158ms	remaining: 5.68s
27:	learn: 2.0914990	total: 161ms	remaining: 5.6s
28:	learn: 2.0188179	total: 165ms	remaining: 5.53s
29:	learn: 1.9490958	total: 169ms	remaining: 5.45s
30:	learn: 1.8819748	total: 172ms	remaining: 5.38s
31:	learn: 1.8183164	total: 176ms	remaining: 5.32s
32:	learn: 1.7609789	total: 180ms	remaining: 5.26s
33:	learn: 1.7008286	total: 183ms	remaining: 5.2s
34:	learn: 1.6438044	total: 186ms	remaining: 5.13s
35:	learn: 1.5954289	total: 190ms	remaining: 5.08s
36:	learn: 1.5458456	total: 193ms	remaining: 5.02s
37:	learn: 1.4999957	total: 197ms	remaining: 4.97s
38:	learn: 1.4557749	total: 200ms	remaining: 4.93s
39:	learn: 1.4136122	total: 203ms	remaining: 4.88s
40:	learn: 1.3740245	total: 207ms	remaining: 4.84s
41:	learn: 1.3360847	total: 210ms	remaining: 4.79s
42:	learn: 1.3006229	total: 213ms	remaining: 4.75s
43:	learn: 1.2683312	total: 217ms	remaining: 4.71s
44:	learn: 1.2328651	total: 220ms	remaining: 4.67s
45:	learn: 1.2022101	total: 223ms	remaining: 4.63s
46:	learn: 1.1716981	total: 227ms	remaining: 4.6s
47:	learn: 1.1425806	total: 230ms	remaining: 4.56s
48:	learn: 1.1126613	total: 234ms	remaining: 4.53s
49:	learn: 1.0845503	total: 237ms	remaining: 4.5s
50:	learn: 1.0590177	total: 240ms	remaining: 4.47s
51:	learn: 1.0347707	total: 244ms	remaining: 4.44s
52:	learn: 1.0113231	total: 247ms	remaining: 4.42s
53:	learn: 0.9886945	total: 251ms	remaining: 4.39s
54:	learn: 0.9669580	total: 254ms	remaining: 4.36s
55:	learn: 0.9468328	total: 257ms	remaining: 4.34s
56:	learn: 0.9270737	total: 261ms	remaining: 4.31s
57:	learn: 0.9100334	total: 264ms	remaining: 4.29s
58:	learn: 0.8917612	total: 268ms	remaining: 4.27s
59:	learn: 0.8750815	total: 271ms	remaining: 4.24s
60:	learn: 0.8601060	total: 274ms	remaining: 4.22s
61:	learn: 0.8462298	total: 277ms	remaining: 4.2s
62:	learn: 0.8345633	total: 281ms	remaining: 4.18s
63:	learn: 0.8216135	total: 284ms	remaining: 4.16s
64:	learn: 0.8083116	total: 288ms	remaining: 4.14s
65:	learn: 0.7956239	total: 292ms	remaining: 4.13s
66:	learn: 0.7843834	total: 295ms	remaining: 4.11s
67:	learn: 0.7740919	total: 298ms	remaining: 4.09s
68:	learn: 0.7643108	total: 301ms	remaining: 4.07s
69:	learn: 0.7561664	total: 305ms	remaining: 4.05s
70:	learn: 0.7491327	total: 308ms	remaining: 4.03s
71:	learn: 0.7411352	total: 311ms	remaining: 4.01s
72:	learn: 0.7330505	total: 315ms	remaining: 4s
73:	learn: 0.7269962	total: 318ms	remaining: 3.98s
74:	learn: 0.7192950	total: 321ms	remaining: 3.96s
75:	learn: 0.7126213	total: 324ms	remaining: 3.94s
76:	learn: 0.7067352	total: 328ms	remaining: 3.93s
77:	learn: 0.7006922	total: 331ms	remaining: 3.91s
78:	learn: 0.6944183	total: 335ms	remaining: 3.9s
79:	learn: 0.6885217	total: 338ms	remaining: 3.89s
80:	learn: 0.6825927	total: 341ms	remaining: 3.87s
81:	learn: 0.6783627	total: 345ms	remaining: 3.86s
82:	learn: 0.6741644	total: 348ms	remaining: 3.84s
83:	learn: 0.6696567	total: 351ms	remaining: 3.83s
84:	learn: 0.6646154	total: 355ms	remaining: 3.82s
85:	learn: 0.6599463	total: 358ms	remaining: 3.81s
86:	learn: 0.6560602	total: 362ms	remaining: 3.8s
87:	learn: 0.6536929	total: 365ms	remaining: 3.78s
88:	learn: 0.6496752	total: 369ms	remaining: 3.77s
89:	learn: 0.6462499	total: 372ms	remaining: 3.76s
90:	learn: 0.6428735	total: 375ms	remaining: 3.75s
91:	learn: 0.6397495	total: 379ms	remaining: 3.74s
92:	learn: 0.6367543	total: 382ms	remaining: 3.72s
93:	learn: 0.6337726	total: 385ms	remaining: 3.71s
94:	learn: 0.6308954	total: 389ms	remaining: 3.7s
95:	learn: 0.6280612	total: 392ms	remaining: 3.69s
96:	learn: 0.6250764	total: 396ms	remaining: 3.68s
97:	learn: 0.6223572	total: 399ms	remaining: 3.67s
98:	learn: 0.6205398	total: 402ms	remaining: 3.66s
99:	learn: 0.6186628	total: 406ms	remaining: 3.65s
100:	learn: 0.6154184	total: 409ms	remaining: 3.64s
101:	learn: 0.6124974	total: 412ms	remaining: 3.63s
102:	learn: 0.6095369	total: 416ms	remaining: 3.62s
103:	learn: 0.6081161	total: 419ms	remaining: 3.61s
104:	learn: 0.6061648	total: 422ms	remaining: 3.6s
105:	learn: 0.6037361	total: 426ms	remaining: 3.59s
106:	learn: 0.6018697	total: 429ms	remaining: 3.58s
107:	learn: 0.5996563	total: 433ms	remaining: 3.57s
108:	learn: 0.5987782	total: 436ms	remaining: 3.56s
109:	learn: 0.5962953	total: 439ms	remaining: 3.55s
110:	learn: 0.5944525	total: 443ms	remaining: 3.55s
111:	learn: 0.5925792	total: 446ms	remaining: 3.54s
112:	learn: 0.5908725	total: 450ms	remaining: 3.53s
113:	learn: 0.5892615	total: 453ms	remaining: 3.52s
114:	learn: 0.5876082	total: 456ms	remaining: 3.51s
115:	learn: 0.5856574	total: 460ms	remaining: 3.51s
116:	learn: 0.5839197	total: 464ms	remaining: 3.5s
117:	learn: 0.5821181	total: 467ms	remaining: 3.49s
118:	learn: 0.5802687	total: 471ms	remaining: 3.48s
119:	learn: 0.5783092	total: 474ms	remaining: 3.48s
120:	learn: 0.5768204	total: 477ms	remaining: 3.47s
121:	learn: 0.5753784	total: 481ms	remaining: 3.46s
122:	learn: 0.5739414	total: 484ms	remaining: 3.45s
123:	learn: 0.5731172	total: 487ms	remaining: 3.44s
124:	learn: 0.5717661	total: 491ms	remaining: 3.44s
125:	learn: 0.5699490	total: 494ms	remaining: 3.43s
126:	learn: 0.5692480	total: 497ms	remaining: 3.42s
127:	learn: 0.5683714	total: 501ms	remaining: 3.41s
128:	learn: 0.5669671	total: 504ms	remaining: 3.4s
129:	learn: 0.5661305	total: 507ms	remaining: 3.4s
130:	learn: 0.5648214	total: 511ms	remaining: 3.39s
131:	learn: 0.5632118	total: 514ms	remaining: 3.38s
132:	learn: 0.5621830	total: 518ms	remaining: 3.37s
133:	learn: 0.5614073	total: 521ms	remaining: 3.37s
134:	learn: 0.5607392	total: 524ms	remaining: 3.36s
135:	learn: 0.5592399	total: 528ms	remaining: 3.35s
136:	learn: 0.5582825	total: 532ms	remaining: 3.35s
137:	learn: 0.5570203	total: 535ms	remaining: 3.34s
138:	learn: 0.5555467	total: 539ms	remaining: 3.33s
139:	learn: 0.5544866	total: 542ms	remaining: 3.33s
140:	learn: 0.5534618	total: 546ms	remaining: 3.32s
141:	learn: 0.5527745	total: 549ms	remaining: 3.32s
142:	learn: 0.5517276	total: 553ms	remaining: 3.31s
143:	learn: 0.5511754	total: 556ms	remaining: 3.31s
144:	learn: 0.5499848	total: 560ms	remaining: 3.3s
145:	learn: 0.5492370	total: 563ms	remaining: 3.29s
146:	learn: 0.5480831	total: 566ms	remaining: 3.29s
147:	learn: 0.5470207	total: 570ms	remaining: 3.28s
148:	learn: 0.5461768	total: 574ms	remaining: 3.27s
149:	learn: 0.5452187	total: 577ms	remaining: 3.27s
150:	learn: 0.5442155	total: 580ms	remaining: 3.26s
151:	learn: 0.5431025	total: 584ms	remaining: 3.26s
152:	learn: 0.5423642	total: 587ms	remaining: 3.25s
153:	learn: 0.5414228	total: 591ms	remaining: 3.25s
154:	learn: 0.5402482	total: 594ms	remaining: 3.24s
155:	learn: 0.5390607	total: 598ms	remaining: 3.23s
156:	learn: 0.5383473	total: 602ms	remaining: 3.23s
157:	learn: 0.5373131	total: 606ms	remaining: 3.23s
158:	learn: 0.5364392	total: 609ms	remaining: 3.22s
159:	learn: 0.5357421	total: 614ms	remaining: 3.22s
160:	learn: 0.5343277	total: 617ms	remaining: 3.22s
161:	learn: 0.5337785	total: 621ms	remaining: 3.21s
162:	learn: 0.5329012	total: 624ms	remaining: 3.21s
163:	learn: 0.5317662	total: 628ms	remaining: 3.2s
164:	learn: 0.5305788	total: 632ms	remaining: 3.2s
165:	learn: 0.5291670	total: 635ms	remaining: 3.19s
166:	learn: 0.5279711	total: 639ms	remaining: 3.19s
167:	learn: 0.5263201	total: 643ms	remaining: 3.18s
168:	learn: 0.5250413	total: 646ms	remaining: 3.18s
169:	learn: 0.5239617	total: 649ms	remaining: 3.17s
170:	learn: 0.5232086	total: 653ms	remaining: 3.16s
171:	learn: 0.5223393	total: 656ms	remaining: 3.16s
172:	learn: 0.5213785	total: 659ms	remaining: 3.15s
173:	learn: 0.5205693	total: 663ms	remaining: 3.15s
174:	learn: 0.5195080	total: 666ms	remaining: 3.14s
175:	learn: 0.5184230	total: 670ms	remaining: 3.13s
176:	learn: 0.5179592	total: 673ms	remaining: 3.13s
177:	learn: 0.5170185	total: 676ms	remaining: 3.12s
178:	learn: 0.5158612	total: 680ms	remaining: 3.12s
179:	learn: 0.5145391	total: 683ms	remaining: 3.11s
180:	learn: 0.5134332	total: 686ms	remaining: 3.11s
181:	learn: 0.5123862	total: 690ms	remaining: 3.1s
182:	learn: 0.5109381	total: 694ms	remaining: 3.1s
183:	learn: 0.5094690	total: 697ms	remaining: 3.09s
184:	learn: 0.5087636	total: 701ms	remaining: 3.09s
185:	learn: 0.5082193	total: 704ms	remaining: 3.08s
186:	learn: 0.5077741	total: 707ms	remaining: 3.07s
187:	learn: 0.5064784	total: 711ms	remaining: 3.07s
188:	learn: 0.5053126	total: 714ms	remaining: 3.06s
189:	learn: 0.5045861	total: 718ms	remaining: 3.06s
190:	learn: 0.5035784	total: 721ms	remaining: 3.05s
191:	learn: 0.5025377	total: 724ms	remaining: 3.05s
192:	learn: 0.5017748	total: 728ms	remaining: 3.04s
193:	learn: 0.5008812	total: 732ms	remaining: 3.04s
194:	learn: 0.5004256	total: 735ms	remaining: 3.03s
195:	learn: 0.4994655	total: 738ms	remaining: 3.03s
196:	learn: 0.4989930	total: 742ms	remaining: 3.02s
197:	learn: 0.4980951	total: 745ms	remaining: 3.02s
198:	learn: 0.4973099	total: 748ms	remaining: 3.01s
199:	learn: 0.4961640	total: 752ms	remaining: 3.01s
200:	learn: 0.4946290	total: 755ms	remaining: 3s
201:	learn: 0.4940315	total: 758ms	remaining: 3s
202:	learn: 0.4931395	total: 762ms	remaining: 2.99s
203:	learn: 0.4920884	total: 765ms	remaining: 2.98s
204:	learn: 0.4909474	total: 769ms	remaining: 2.98s
205:	learn: 0.4899796	total: 773ms	remaining: 2.98s
206:	learn: 0.4890645	total: 776ms	remaining: 2.97s
207:	learn: 0.4882374	total: 779ms	remaining: 2.97s
208:	learn: 0.4871172	total: 783ms	remaining: 2.96s
209:	learn: 0.4861811	total: 786ms	remaining: 2.96s
210:	learn: 0.4850126	total: 790ms	remaining: 2.95s
211:	learn: 0.4837164	total: 793ms	remaining: 2.95s
212:	learn: 0.4829656	total: 797ms	remaining: 2.94s
213:	learn: 0.4819662	total: 800ms	remaining: 2.94s
214:	learn: 0.4817394	total: 803ms	remaining: 2.93s
215:	learn: 0.4811177	total: 807ms	remaining: 2.93s
216:	learn: 0.4806224	total: 810ms	remaining: 2.92s
217:	learn: 0.4800808	total: 813ms	remaining: 2.92s
218:	learn: 0.4789126	total: 817ms	remaining: 2.91s
219:	learn: 0.4784037	total: 820ms	remaining: 2.91s
220:	learn: 0.4774369	total: 823ms	remaining: 2.9s
221:	learn: 0.4764525	total: 827ms	remaining: 2.9s
222:	learn: 0.4757860	total: 830ms	remaining: 2.89s
223:	learn: 0.4748362	total: 833ms	remaining: 2.89s
224:	learn: 0.4738011	total: 837ms	remaining: 2.88s
225:	learn: 0.4732533	total: 840ms	remaining: 2.88s
226:	learn: 0.4724302	total: 844ms	remaining: 2.87s
227:	learn: 0.4714185	total: 847ms	remaining: 2.87s
228:	learn: 0.4704901	total: 850ms	remaining: 2.86s
229:	learn: 0.4696393	total: 854ms	remaining: 2.86s
230:	learn: 0.4688642	total: 857ms	remaining: 2.85s
231:	learn: 0.4684056	total: 861ms	remaining: 2.85s
232:	learn: 0.4680263	total: 864ms	remaining: 2.84s
233:	learn: 0.4673195	total: 867ms	remaining: 2.84s
234:	learn: 0.4661429	total: 871ms	remaining: 2.83s
235:	learn: 0.4651517	total: 874ms	remaining: 2.83s
236:	learn: 0.4643784	total: 877ms	remaining: 2.82s
237:	learn: 0.4639422	total: 881ms	remaining: 2.82s
238:	learn: 0.4629527	total: 884ms	remaining: 2.81s
239:	learn: 0.4621494	total: 888ms	remaining: 2.81s
240:	learn: 0.4611603	total: 891ms	remaining: 2.81s
241:	learn: 0.4603838	total: 894ms	remaining: 2.8s
242:	learn: 0.4593452	total: 898ms	remaining: 2.8s
243:	learn: 0.4586434	total: 901ms	remaining: 2.79s
244:	learn: 0.4577788	total: 905ms	remaining: 2.79s
245:	learn: 0.4576161	total: 908ms	remaining: 2.78s
246:	learn: 0.4570526	total: 911ms	remaining: 2.78s
247:	learn: 0.4560632	total: 915ms	remaining: 2.77s
248:	learn: 0.4552049	total: 918ms	remaining: 2.77s
249:	learn: 0.4544847	total: 922ms	remaining: 2.77s
250:	learn: 0.4536628	total: 925ms	remaining: 2.76s
251:	learn: 0.4529050	total: 928ms	remaining: 2.75s
252:	learn: 0.4521824	total: 932ms	remaining: 2.75s
253:	learn: 0.4515072	total: 935ms	remaining: 2.75s
254:	learn: 0.4506698	total: 939ms	remaining: 2.74s
255:	learn: 0.4501205	total: 942ms	remaining: 2.74s
256:	learn: 0.4493848	total: 945ms	remaining: 2.73s
257:	learn: 0.4486159	total: 949ms	remaining: 2.73s
258:	learn: 0.4482442	total: 953ms	remaining: 2.73s
259:	learn: 0.4478827	total: 956ms	remaining: 2.72s
260:	learn: 0.4473978	total: 959ms	remaining: 2.71s
261:	learn: 0.4468994	total: 962ms	remaining: 2.71s
262:	learn: 0.4456701	total: 966ms	remaining: 2.71s
263:	learn: 0.4449110	total: 969ms	remaining: 2.7s
264:	learn: 0.4442542	total: 973ms	remaining: 2.7s
265:	learn: 0.4431941	total: 976ms	remaining: 2.69s
266:	learn: 0.4425103	total: 980ms	remaining: 2.69s
267:	learn: 0.4416661	total: 984ms	remaining: 2.69s
268:	learn: 0.4411630	total: 987ms	remaining: 2.68s
269:	learn: 0.4404441	total: 991ms	remaining: 2.68s
270:	learn: 0.4400363	total: 994ms	remaining: 2.67s
271:	learn: 0.4398475	total: 998ms	remaining: 2.67s
272:	learn: 0.4390363	total: 1s	remaining: 2.67s
273:	learn: 0.4381161	total: 1s	remaining: 2.66s
274:	learn: 0.4374399	total: 1.01s	remaining: 2.66s
275:	learn: 0.4370343	total: 1.01s	remaining: 2.65s
276:	learn: 0.4366440	total: 1.01s	remaining: 2.65s
277:	learn: 0.4363497	total: 1.02s	remaining: 2.64s
278:	learn: 0.4360749	total: 1.02s	remaining: 2.64s
279:	learn: 0.4353030	total: 1.02s	remaining: 2.63s
280:	learn: 0.4346082	total: 1.03s	remaining: 2.63s
281:	learn: 0.4342305	total: 1.03s	remaining: 2.63s
282:	learn: 0.4338120	total: 1.03s	remaining: 2.62s
283:	learn: 0.4336984	total: 1.04s	remaining: 2.62s
284:	learn: 0.4329844	total: 1.04s	remaining: 2.61s
285:	learn: 0.4325585	total: 1.04s	remaining: 2.61s
286:	learn: 0.4316236	total: 1.05s	remaining: 2.6s
287:	learn: 0.4312618	total: 1.05s	remaining: 2.6s
288:	learn: 0.4305788	total: 1.05s	remaining: 2.6s
289:	learn: 0.4299323	total: 1.06s	remaining: 2.59s
290:	learn: 0.4294147	total: 1.06s	remaining: 2.59s
291:	learn: 0.4287311	total: 1.06s	remaining: 2.58s
292:	learn: 0.4281021	total: 1.07s	remaining: 2.58s
293:	learn: 0.4274824	total: 1.07s	remaining: 2.58s
294:	learn: 0.4266494	total: 1.08s	remaining: 2.57s
295:	learn: 0.4259133	total: 1.08s	remaining: 2.57s
296:	learn: 0.4254208	total: 1.08s	remaining: 2.56s
297:	learn: 0.4248581	total: 1.09s	remaining: 2.56s
298:	learn: 0.4244197	total: 1.09s	remaining: 2.55s
299:	learn: 0.4238834	total: 1.09s	remaining: 2.55s
300:	learn: 0.4235753	total: 1.1s	remaining: 2.55s
301:	learn: 0.4230278	total: 1.1s	remaining: 2.54s
302:	learn: 0.4223168	total: 1.1s	remaining: 2.54s
303:	learn: 0.4218202	total: 1.11s	remaining: 2.53s
304:	learn: 0.4213324	total: 1.11s	remaining: 2.53s
305:	learn: 0.4209713	total: 1.11s	remaining: 2.52s
306:	learn: 0.4206682	total: 1.12s	remaining: 2.52s
307:	learn: 0.4205730	total: 1.12s	remaining: 2.52s
308:	learn: 0.4200366	total: 1.12s	remaining: 2.51s
309:	learn: 0.4196622	total: 1.13s	remaining: 2.51s
310:	learn: 0.4188254	total: 1.13s	remaining: 2.5s
311:	learn: 0.4182222	total: 1.13s	remaining: 2.5s
312:	learn: 0.4181272	total: 1.14s	remaining: 2.49s
313:	learn: 0.4178863	total: 1.14s	remaining: 2.49s
314:	learn: 0.4171997	total: 1.14s	remaining: 2.48s
315:	learn: 0.4165719	total: 1.15s	remaining: 2.48s
316:	learn: 0.4158275	total: 1.15s	remaining: 2.48s
317:	learn: 0.4150223	total: 1.15s	remaining: 2.47s
318:	learn: 0.4144227	total: 1.16s	remaining: 2.47s
319:	learn: 0.4138124	total: 1.16s	remaining: 2.46s
320:	learn: 0.4135332	total: 1.16s	remaining: 2.46s
321:	learn: 0.4129018	total: 1.17s	remaining: 2.46s
322:	learn: 0.4123695	total: 1.17s	remaining: 2.45s
323:	learn: 0.4116624	total: 1.17s	remaining: 2.45s
324:	learn: 0.4108634	total: 1.18s	remaining: 2.44s
325:	learn: 0.4103299	total: 1.18s	remaining: 2.44s
326:	learn: 0.4096449	total: 1.18s	remaining: 2.44s
327:	learn: 0.4092467	total: 1.19s	remaining: 2.43s
328:	learn: 0.4090363	total: 1.19s	remaining: 2.43s
329:	learn: 0.4080857	total: 1.2s	remaining: 2.43s
330:	learn: 0.4077160	total: 1.2s	remaining: 2.42s
331:	learn: 0.4071226	total: 1.2s	remaining: 2.42s
332:	learn: 0.4067749	total: 1.21s	remaining: 2.42s
333:	learn: 0.4062413	total: 1.21s	remaining: 2.41s
334:	learn: 0.4055695	total: 1.21s	remaining: 2.41s
335:	learn: 0.4054942	total: 1.22s	remaining: 2.4s
336:	learn: 0.4047835	total: 1.22s	remaining: 2.4s
337:	learn: 0.4042196	total: 1.22s	remaining: 2.4s
338:	learn: 0.4038735	total: 1.23s	remaining: 2.39s
339:	learn: 0.4036386	total: 1.23s	remaining: 2.39s
340:	learn: 0.4033409	total: 1.24s	remaining: 2.39s
341:	learn: 0.4026643	total: 1.24s	remaining: 2.38s
342:	learn: 0.4023055	total: 1.24s	remaining: 2.38s
343:	learn: 0.4021658	total: 1.25s	remaining: 2.38s
344:	learn: 0.4018412	total: 1.25s	remaining: 2.37s
345:	learn: 0.4013429	total: 1.25s	remaining: 2.37s
346:	learn: 0.4008246	total: 1.26s	remaining: 2.37s
347:	learn: 0.4002585	total: 1.26s	remaining: 2.36s
348:	learn: 0.3998110	total: 1.26s	remaining: 2.36s
349:	learn: 0.3990401	total: 1.27s	remaining: 2.35s
350:	learn: 0.3982868	total: 1.27s	remaining: 2.35s
351:	learn: 0.3977941	total: 1.27s	remaining: 2.35s
352:	learn: 0.3971540	total: 1.28s	remaining: 2.34s
353:	learn: 0.3966626	total: 1.28s	remaining: 2.34s
354:	learn: 0.3965989	total: 1.28s	remaining: 2.33s
355:	learn: 0.3962189	total: 1.29s	remaining: 2.33s
356:	learn: 0.3958068	total: 1.29s	remaining: 2.33s
357:	learn: 0.3953095	total: 1.29s	remaining: 2.32s
358:	learn: 0.3948418	total: 1.3s	remaining: 2.32s
359:	learn: 0.3946123	total: 1.3s	remaining: 2.31s
360:	learn: 0.3938100	total: 1.3s	remaining: 2.31s
361:	learn: 0.3932202	total: 1.31s	remaining: 2.31s
362:	learn: 0.3925485	total: 1.31s	remaining: 2.3s
363:	learn: 0.3918346	total: 1.31s	remaining: 2.3s
364:	learn: 0.3915180	total: 1.32s	remaining: 2.29s
365:	learn: 0.3907133	total: 1.32s	remaining: 2.29s
366:	learn: 0.3905787	total: 1.32s	remaining: 2.29s
367:	learn: 0.3900100	total: 1.33s	remaining: 2.28s
368:	learn: 0.3894581	total: 1.33s	remaining: 2.28s
369:	learn: 0.3887745	total: 1.33s	remaining: 2.27s
370:	learn: 0.3881815	total: 1.34s	remaining: 2.27s
371:	learn: 0.3877839	total: 1.34s	remaining: 2.27s
372:	learn: 0.3872649	total: 1.34s	remaining: 2.26s
373:	learn: 0.3864811	total: 1.35s	remaining: 2.26s
374:	learn: 0.3859586	total: 1.35s	remaining: 2.25s
375:	learn: 0.3857552	total: 1.35s	remaining: 2.25s
376:	learn: 0.3851249	total: 1.36s	remaining: 2.25s
377:	learn: 0.3846241	total: 1.36s	remaining: 2.24s
378:	learn: 0.3842221	total: 1.36s	remaining: 2.24s
379:	learn: 0.3840501	total: 1.37s	remaining: 2.23s
380:	learn: 0.3839565	total: 1.37s	remaining: 2.23s
381:	learn: 0.3834811	total: 1.38s	remaining: 2.23s
382:	learn: 0.3829302	total: 1.38s	remaining: 2.22s
383:	learn: 0.3824233	total: 1.38s	remaining: 2.22s
384:	learn: 0.3820862	total: 1.39s	remaining: 2.21s
385:	learn: 0.3817059	total: 1.39s	remaining: 2.21s
386:	learn: 0.3810940	total: 1.39s	remaining: 2.21s
387:	learn: 0.3807112	total: 1.4s	remaining: 2.2s
388:	learn: 0.3802605	total: 1.4s	remaining: 2.2s
389:	learn: 0.3798039	total: 1.4s	remaining: 2.19s
390:	learn: 0.3793238	total: 1.41s	remaining: 2.19s
391:	learn: 0.3787842	total: 1.41s	remaining: 2.19s
392:	learn: 0.3783469	total: 1.41s	remaining: 2.18s
393:	learn: 0.3774757	total: 1.42s	remaining: 2.18s
394:	learn: 0.3770911	total: 1.42s	remaining: 2.18s
395:	learn: 0.3767255	total: 1.42s	remaining: 2.17s
396:	learn: 0.3761264	total: 1.43s	remaining: 2.17s
397:	learn: 0.3757592	total: 1.43s	remaining: 2.16s
398:	learn: 0.3751453	total: 1.43s	remaining: 2.16s
399:	learn: 0.3748033	total: 1.44s	remaining: 2.16s
400:	learn: 0.3743184	total: 1.44s	remaining: 2.15s
401:	learn: 0.3740077	total: 1.44s	remaining: 2.15s
402:	learn: 0.3733825	total: 1.45s	remaining: 2.14s
403:	learn: 0.3728581	total: 1.45s	remaining: 2.14s
404:	learn: 0.3722381	total: 1.45s	remaining: 2.14s
405:	learn: 0.3716600	total: 1.46s	remaining: 2.13s
406:	learn: 0.3709735	total: 1.46s	remaining: 2.13s
407:	learn: 0.3706292	total: 1.46s	remaining: 2.12s
408:	learn: 0.3700405	total: 1.47s	remaining: 2.12s
409:	learn: 0.3695724	total: 1.47s	remaining: 2.12s
410:	learn: 0.3688604	total: 1.47s	remaining: 2.11s
411:	learn: 0.3687772	total: 1.48s	remaining: 2.11s
412:	learn: 0.3680599	total: 1.48s	remaining: 2.1s
413:	learn: 0.3676867	total: 1.48s	remaining: 2.1s
414:	learn: 0.3673532	total: 1.49s	remaining: 2.1s
415:	learn: 0.3667493	total: 1.49s	remaining: 2.09s
416:	learn: 0.3663581	total: 1.5s	remaining: 2.09s
417:	learn: 0.3659273	total: 1.5s	remaining: 2.09s
418:	learn: 0.3653585	total: 1.5s	remaining: 2.08s
419:	learn: 0.3647104	total: 1.5s	remaining: 2.08s
420:	learn: 0.3643467	total: 1.51s	remaining: 2.07s
421:	learn: 0.3638706	total: 1.51s	remaining: 2.07s
422:	learn: 0.3637044	total: 1.51s	remaining: 2.07s
423:	learn: 0.3630532	total: 1.52s	remaining: 2.06s
424:	learn: 0.3628976	total: 1.52s	remaining: 2.06s
425:	learn: 0.3622257	total: 1.52s	remaining: 2.06s
426:	learn: 0.3618914	total: 1.53s	remaining: 2.05s
427:	learn: 0.3614434	total: 1.53s	remaining: 2.05s
428:	learn: 0.3610056	total: 1.53s	remaining: 2.04s
429:	learn: 0.3603069	total: 1.54s	remaining: 2.04s
430:	learn: 0.3599433	total: 1.54s	remaining: 2.04s
431:	learn: 0.3598531	total: 1.54s	remaining: 2.03s
432:	learn: 0.3597998	total: 1.55s	remaining: 2.03s
433:	learn: 0.3592574	total: 1.55s	remaining: 2.02s
434:	learn: 0.3586937	total: 1.55s	remaining: 2.02s
435:	learn: 0.3582985	total: 1.56s	remaining: 2.02s
436:	learn: 0.3579804	total: 1.56s	remaining: 2.01s
437:	learn: 0.3576664	total: 1.56s	remaining: 2.01s
438:	learn: 0.3572953	total: 1.57s	remaining: 2s
439:	learn: 0.3568969	total: 1.57s	remaining: 2s
440:	learn: 0.3562306	total: 1.58s	remaining: 2s
441:	learn: 0.3555624	total: 1.58s	remaining: 1.99s
442:	learn: 0.3550610	total: 1.58s	remaining: 1.99s
443:	learn: 0.3545855	total: 1.59s	remaining: 1.99s
444:	learn: 0.3541245	total: 1.59s	remaining: 1.98s
445:	learn: 0.3537684	total: 1.59s	remaining: 1.98s
446:	learn: 0.3534726	total: 1.6s	remaining: 1.97s
447:	learn: 0.3533520	total: 1.6s	remaining: 1.97s
448:	learn: 0.3530596	total: 1.6s	remaining: 1.97s
449:	learn: 0.3525138	total: 1.61s	remaining: 1.96s
450:	learn: 0.3520520	total: 1.61s	remaining: 1.96s
451:	learn: 0.3517294	total: 1.61s	remaining: 1.96s
452:	learn: 0.3516567	total: 1.62s	remaining: 1.95s
453:	learn: 0.3513043	total: 1.62s	remaining: 1.95s
454:	learn: 0.3509406	total: 1.62s	remaining: 1.94s
455:	learn: 0.3504919	total: 1.63s	remaining: 1.94s
456:	learn: 0.3498914	total: 1.63s	remaining: 1.94s
457:	learn: 0.3494417	total: 1.63s	remaining: 1.93s
458:	learn: 0.3490944	total: 1.64s	remaining: 1.93s
459:	learn: 0.3487102	total: 1.64s	remaining: 1.92s
460:	learn: 0.3484557	total: 1.64s	remaining: 1.92s
461:	learn: 0.3478789	total: 1.65s	remaining: 1.92s
462:	learn: 0.3474275	total: 1.65s	remaining: 1.91s
463:	learn: 0.3468342	total: 1.65s	remaining: 1.91s
464:	learn: 0.3463040	total: 1.66s	remaining: 1.91s
465:	learn: 0.3458213	total: 1.66s	remaining: 1.9s
466:	learn: 0.3451803	total: 1.66s	remaining: 1.9s
467:	learn: 0.3447292	total: 1.67s	remaining: 1.89s
468:	learn: 0.3443530	total: 1.67s	remaining: 1.89s
469:	learn: 0.3437850	total: 1.67s	remaining: 1.89s
470:	learn: 0.3433565	total: 1.68s	remaining: 1.88s
471:	learn: 0.3430048	total: 1.68s	remaining: 1.88s
472:	learn: 0.3425770	total: 1.68s	remaining: 1.88s
473:	learn: 0.3419536	total: 1.69s	remaining: 1.87s
474:	learn: 0.3414171	total: 1.69s	remaining: 1.87s
475:	learn: 0.3409048	total: 1.69s	remaining: 1.86s
476:	learn: 0.3406389	total: 1.7s	remaining: 1.86s
477:	learn: 0.3402231	total: 1.7s	remaining: 1.86s
478:	learn: 0.3397786	total: 1.7s	remaining: 1.85s
479:	learn: 0.3396398	total: 1.71s	remaining: 1.85s
480:	learn: 0.3395825	total: 1.71s	remaining: 1.85s
481:	learn: 0.3391371	total: 1.71s	remaining: 1.84s
482:	learn: 0.3386234	total: 1.72s	remaining: 1.84s
483:	learn: 0.3382257	total: 1.72s	remaining: 1.83s
484:	learn: 0.3378495	total: 1.72s	remaining: 1.83s
485:	learn: 0.3375195	total: 1.73s	remaining: 1.83s
486:	learn: 0.3371611	total: 1.73s	remaining: 1.82s
487:	learn: 0.3367422	total: 1.73s	remaining: 1.82s
488:	learn: 0.3363054	total: 1.74s	remaining: 1.82s
489:	learn: 0.3359786	total: 1.74s	remaining: 1.81s
490:	learn: 0.3356998	total: 1.75s	remaining: 1.81s
491:	learn: 0.3354453	total: 1.75s	remaining: 1.8s
492:	learn: 0.3352187	total: 1.75s	remaining: 1.8s
493:	learn: 0.3350124	total: 1.75s	remaining: 1.8s
494:	learn: 0.3346929	total: 1.76s	remaining: 1.79s
495:	learn: 0.3342032	total: 1.76s	remaining: 1.79s
496:	learn: 0.3336972	total: 1.77s	remaining: 1.79s
497:	learn: 0.3336357	total: 1.77s	remaining: 1.78s
498:	learn: 0.3333691	total: 1.77s	remaining: 1.78s
499:	learn: 0.3329063	total: 1.78s	remaining: 1.78s
500:	learn: 0.3326459	total: 1.78s	remaining: 1.77s
501:	learn: 0.3323511	total: 1.78s	remaining: 1.77s
502:	learn: 0.3319477	total: 1.79s	remaining: 1.76s
503:	learn: 0.3317364	total: 1.79s	remaining: 1.76s
504:	learn: 0.3312039	total: 1.79s	remaining: 1.76s
505:	learn: 0.3305307	total: 1.8s	remaining: 1.75s
506:	learn: 0.3300698	total: 1.8s	remaining: 1.75s
507:	learn: 0.3296915	total: 1.8s	remaining: 1.75s
508:	learn: 0.3294125	total: 1.81s	remaining: 1.74s
509:	learn: 0.3291011	total: 1.81s	remaining: 1.74s
510:	learn: 0.3289021	total: 1.81s	remaining: 1.74s
511:	learn: 0.3285137	total: 1.82s	remaining: 1.73s
512:	learn: 0.3278964	total: 1.82s	remaining: 1.73s
513:	learn: 0.3274555	total: 1.82s	remaining: 1.72s
514:	learn: 0.3270691	total: 1.83s	remaining: 1.72s
515:	learn: 0.3268052	total: 1.83s	remaining: 1.72s
516:	learn: 0.3263295	total: 1.83s	remaining: 1.71s
517:	learn: 0.3258524	total: 1.84s	remaining: 1.71s
518:	learn: 0.3255794	total: 1.84s	remaining: 1.71s
519:	learn: 0.3251552	total: 1.84s	remaining: 1.7s
520:	learn: 0.3247569	total: 1.85s	remaining: 1.7s
521:	learn: 0.3242420	total: 1.85s	remaining: 1.7s
522:	learn: 0.3240746	total: 1.85s	remaining: 1.69s
523:	learn: 0.3239854	total: 1.86s	remaining: 1.69s
524:	learn: 0.3239415	total: 1.86s	remaining: 1.68s
525:	learn: 0.3236346	total: 1.86s	remaining: 1.68s
526:	learn: 0.3233439	total: 1.87s	remaining: 1.68s
527:	learn: 0.3232261	total: 1.87s	remaining: 1.67s
528:	learn: 0.3226489	total: 1.87s	remaining: 1.67s
529:	learn: 0.3221208	total: 1.88s	remaining: 1.67s
530:	learn: 0.3217815	total: 1.88s	remaining: 1.66s
531:	learn: 0.3216486	total: 1.88s	remaining: 1.66s
532:	learn: 0.3213358	total: 1.89s	remaining: 1.65s
533:	learn: 0.3211304	total: 1.89s	remaining: 1.65s
534:	learn: 0.3210996	total: 1.89s	remaining: 1.65s
535:	learn: 0.3207515	total: 1.9s	remaining: 1.64s
536:	learn: 0.3204738	total: 1.9s	remaining: 1.64s
537:	learn: 0.3198882	total: 1.9s	remaining: 1.64s
538:	learn: 0.3195154	total: 1.91s	remaining: 1.63s
539:	learn: 0.3192952	total: 1.91s	remaining: 1.63s
540:	learn: 0.3190280	total: 1.92s	remaining: 1.62s
541:	learn: 0.3187129	total: 1.92s	remaining: 1.62s
542:	learn: 0.3181960	total: 1.92s	remaining: 1.62s
543:	learn: 0.3176606	total: 1.93s	remaining: 1.61s
544:	learn: 0.3170949	total: 1.93s	remaining: 1.61s
545:	learn: 0.3167459	total: 1.93s	remaining: 1.61s
546:	learn: 0.3163222	total: 1.94s	remaining: 1.6s
547:	learn: 0.3161420	total: 1.94s	remaining: 1.6s
548:	learn: 0.3157992	total: 1.94s	remaining: 1.6s
549:	learn: 0.3152920	total: 1.95s	remaining: 1.59s
550:	learn: 0.3148228	total: 1.95s	remaining: 1.59s
551:	learn: 0.3145167	total: 1.95s	remaining: 1.58s
552:	learn: 0.3139810	total: 1.96s	remaining: 1.58s
553:	learn: 0.3133923	total: 1.96s	remaining: 1.58s
554:	learn: 0.3129725	total: 1.96s	remaining: 1.57s
555:	learn: 0.3125777	total: 1.97s	remaining: 1.57s
556:	learn: 0.3123591	total: 1.97s	remaining: 1.57s
557:	learn: 0.3118976	total: 1.97s	remaining: 1.56s
558:	learn: 0.3117266	total: 1.98s	remaining: 1.56s
559:	learn: 0.3114944	total: 1.98s	remaining: 1.56s
560:	learn: 0.3112216	total: 1.98s	remaining: 1.55s
561:	learn: 0.3108621	total: 1.99s	remaining: 1.55s
562:	learn: 0.3104885	total: 1.99s	remaining: 1.54s
563:	learn: 0.3103672	total: 1.99s	remaining: 1.54s
564:	learn: 0.3099546	total: 2s	remaining: 1.54s
565:	learn: 0.3094861	total: 2s	remaining: 1.53s
566:	learn: 0.3091734	total: 2s	remaining: 1.53s
567:	learn: 0.3089148	total: 2.01s	remaining: 1.53s
568:	learn: 0.3086673	total: 2.01s	remaining: 1.52s
569:	learn: 0.3082695	total: 2.01s	remaining: 1.52s
570:	learn: 0.3078891	total: 2.02s	remaining: 1.52s
571:	learn: 0.3074927	total: 2.02s	remaining: 1.51s
572:	learn: 0.3069862	total: 2.02s	remaining: 1.51s
573:	learn: 0.3064039	total: 2.03s	remaining: 1.5s
574:	learn: 0.3061550	total: 2.03s	remaining: 1.5s
575:	learn: 0.3056908	total: 2.04s	remaining: 1.5s
576:	learn: 0.3052593	total: 2.04s	remaining: 1.49s
577:	learn: 0.3049534	total: 2.04s	remaining: 1.49s
578:	learn: 0.3044688	total: 2.04s	remaining: 1.49s
579:	learn: 0.3041772	total: 2.05s	remaining: 1.48s
580:	learn: 0.3037533	total: 2.05s	remaining: 1.48s
581:	learn: 0.3035700	total: 2.06s	remaining: 1.48s
582:	learn: 0.3031629	total: 2.06s	remaining: 1.47s
583:	learn: 0.3028838	total: 2.06s	remaining: 1.47s
584:	learn: 0.3023030	total: 2.07s	remaining: 1.47s
585:	learn: 0.3019222	total: 2.07s	remaining: 1.46s
586:	learn: 0.3015915	total: 2.07s	remaining: 1.46s
587:	learn: 0.3015704	total: 2.08s	remaining: 1.46s
588:	learn: 0.3009768	total: 2.08s	remaining: 1.45s
589:	learn: 0.3008180	total: 2.08s	remaining: 1.45s
590:	learn: 0.3005118	total: 2.09s	remaining: 1.44s
591:	learn: 0.3002814	total: 2.09s	remaining: 1.44s
592:	learn: 0.3000431	total: 2.09s	remaining: 1.44s
593:	learn: 0.2997323	total: 2.1s	remaining: 1.43s
594:	learn: 0.2994389	total: 2.1s	remaining: 1.43s
595:	learn: 0.2993389	total: 2.1s	remaining: 1.43s
596:	learn: 0.2988775	total: 2.11s	remaining: 1.42s
597:	learn: 0.2987598	total: 2.11s	remaining: 1.42s
598:	learn: 0.2984033	total: 2.12s	remaining: 1.42s
599:	learn: 0.2978245	total: 2.12s	remaining: 1.41s
600:	learn: 0.2974830	total: 2.12s	remaining: 1.41s
601:	learn: 0.2969405	total: 2.13s	remaining: 1.4s
602:	learn: 0.2966278	total: 2.13s	remaining: 1.4s
603:	learn: 0.2962533	total: 2.13s	remaining: 1.4s
604:	learn: 0.2959855	total: 2.14s	remaining: 1.39s
605:	learn: 0.2956097	total: 2.14s	remaining: 1.39s
606:	learn: 0.2953299	total: 2.14s	remaining: 1.39s
607:	learn: 0.2951260	total: 2.15s	remaining: 1.39s
608:	learn: 0.2947087	total: 2.15s	remaining: 1.38s
609:	learn: 0.2944823	total: 2.16s	remaining: 1.38s
610:	learn: 0.2943654	total: 2.16s	remaining: 1.38s
611:	learn: 0.2941610	total: 2.16s	remaining: 1.37s
612:	learn: 0.2938108	total: 2.17s	remaining: 1.37s
613:	learn: 0.2934358	total: 2.17s	remaining: 1.36s
614:	learn: 0.2931197	total: 2.17s	remaining: 1.36s
615:	learn: 0.2930426	total: 2.18s	remaining: 1.36s
616:	learn: 0.2928295	total: 2.18s	remaining: 1.35s
617:	learn: 0.2924921	total: 2.18s	remaining: 1.35s
618:	learn: 0.2921751	total: 2.19s	remaining: 1.35s
619:	learn: 0.2918991	total: 2.19s	remaining: 1.34s
620:	learn: 0.2913899	total: 2.19s	remaining: 1.34s
621:	learn: 0.2908501	total: 2.2s	remaining: 1.34s
622:	learn: 0.2905626	total: 2.2s	remaining: 1.33s
623:	learn: 0.2902904	total: 2.21s	remaining: 1.33s
624:	learn: 0.2900379	total: 2.21s	remaining: 1.32s
625:	learn: 0.2896903	total: 2.21s	remaining: 1.32s
626:	learn: 0.2892638	total: 2.22s	remaining: 1.32s
627:	learn: 0.2889578	total: 2.22s	remaining: 1.31s
628:	learn: 0.2887475	total: 2.22s	remaining: 1.31s
629:	learn: 0.2883429	total: 2.23s	remaining: 1.31s
630:	learn: 0.2881113	total: 2.23s	remaining: 1.3s
631:	learn: 0.2876183	total: 2.23s	remaining: 1.3s
632:	learn: 0.2875565	total: 2.24s	remaining: 1.3s
633:	learn: 0.2871677	total: 2.24s	remaining: 1.29s
634:	learn: 0.2868290	total: 2.24s	remaining: 1.29s
635:	learn: 0.2866803	total: 2.25s	remaining: 1.29s
636:	learn: 0.2864461	total: 2.25s	remaining: 1.28s
637:	learn: 0.2861437	total: 2.25s	remaining: 1.28s
638:	learn: 0.2858220	total: 2.26s	remaining: 1.27s
639:	learn: 0.2856065	total: 2.26s	remaining: 1.27s
640:	learn: 0.2853185	total: 2.26s	remaining: 1.27s
641:	learn: 0.2849693	total: 2.27s	remaining: 1.26s
642:	learn: 0.2844873	total: 2.27s	remaining: 1.26s
643:	learn: 0.2841484	total: 2.27s	remaining: 1.26s
644:	learn: 0.2838365	total: 2.28s	remaining: 1.25s
645:	learn: 0.2835952	total: 2.28s	remaining: 1.25s
646:	learn: 0.2831927	total: 2.28s	remaining: 1.25s
647:	learn: 0.2827722	total: 2.29s	remaining: 1.24s
648:	learn: 0.2824569	total: 2.29s	remaining: 1.24s
649:	learn: 0.2821450	total: 2.29s	remaining: 1.24s
650:	learn: 0.2820499	total: 2.3s	remaining: 1.23s
651:	learn: 0.2818476	total: 2.3s	remaining: 1.23s
652:	learn: 0.2815995	total: 2.31s	remaining: 1.23s
653:	learn: 0.2812118	total: 2.31s	remaining: 1.22s
654:	learn: 0.2809202	total: 2.31s	remaining: 1.22s
655:	learn: 0.2806815	total: 2.32s	remaining: 1.22s
656:	learn: 0.2802227	total: 2.32s	remaining: 1.21s
657:	learn: 0.2800624	total: 2.33s	remaining: 1.21s
658:	learn: 0.2795849	total: 2.33s	remaining: 1.21s
659:	learn: 0.2793078	total: 2.33s	remaining: 1.2s
660:	learn: 0.2791128	total: 2.34s	remaining: 1.2s
661:	learn: 0.2788399	total: 2.34s	remaining: 1.19s
662:	learn: 0.2788037	total: 2.34s	remaining: 1.19s
663:	learn: 0.2786668	total: 2.35s	remaining: 1.19s
664:	learn: 0.2784946	total: 2.35s	remaining: 1.18s
665:	learn: 0.2780830	total: 2.35s	remaining: 1.18s
666:	learn: 0.2778176	total: 2.36s	remaining: 1.18s
667:	learn: 0.2775362	total: 2.36s	remaining: 1.17s
668:	learn: 0.2771948	total: 2.36s	remaining: 1.17s
669:	learn: 0.2770151	total: 2.37s	remaining: 1.17s
670:	learn: 0.2766705	total: 2.37s	remaining: 1.16s
671:	learn: 0.2761992	total: 2.38s	remaining: 1.16s
672:	learn: 0.2761667	total: 2.38s	remaining: 1.16s
673:	learn: 0.2758894	total: 2.38s	remaining: 1.15s
674:	learn: 0.2754654	total: 2.39s	remaining: 1.15s
675:	learn: 0.2751694	total: 2.39s	remaining: 1.15s
676:	learn: 0.2747818	total: 2.39s	remaining: 1.14s
677:	learn: 0.2743284	total: 2.4s	remaining: 1.14s
678:	learn: 0.2738872	total: 2.4s	remaining: 1.13s
679:	learn: 0.2735984	total: 2.4s	remaining: 1.13s
680:	learn: 0.2732869	total: 2.41s	remaining: 1.13s
681:	learn: 0.2730972	total: 2.41s	remaining: 1.12s
682:	learn: 0.2728736	total: 2.41s	remaining: 1.12s
683:	learn: 0.2726755	total: 2.42s	remaining: 1.12s
684:	learn: 0.2723927	total: 2.42s	remaining: 1.11s
685:	learn: 0.2719884	total: 2.42s	remaining: 1.11s
686:	learn: 0.2715762	total: 2.43s	remaining: 1.1s
687:	learn: 0.2713579	total: 2.43s	remaining: 1.1s
688:	learn: 0.2711750	total: 2.43s	remaining: 1.1s
689:	learn: 0.2709947	total: 2.44s	remaining: 1.09s
690:	learn: 0.2707097	total: 2.44s	remaining: 1.09s
691:	learn: 0.2704260	total: 2.44s	remaining: 1.09s
692:	learn: 0.2701645	total: 2.45s	remaining: 1.08s
693:	learn: 0.2698393	total: 2.45s	remaining: 1.08s
694:	learn: 0.2695105	total: 2.45s	remaining: 1.08s
695:	learn: 0.2691283	total: 2.46s	remaining: 1.07s
696:	learn: 0.2688368	total: 2.46s	remaining: 1.07s
697:	learn: 0.2683466	total: 2.46s	remaining: 1.07s
698:	learn: 0.2680603	total: 2.47s	remaining: 1.06s
699:	learn: 0.2677699	total: 2.47s	remaining: 1.06s
700:	learn: 0.2674562	total: 2.48s	remaining: 1.05s
701:	learn: 0.2671464	total: 2.48s	remaining: 1.05s
702:	learn: 0.2668508	total: 2.48s	remaining: 1.05s
703:	learn: 0.2666627	total: 2.48s	remaining: 1.04s
704:	learn: 0.2663736	total: 2.49s	remaining: 1.04s
705:	learn: 0.2660888	total: 2.49s	remaining: 1.04s
706:	learn: 0.2658267	total: 2.5s	remaining: 1.03s
707:	learn: 0.2656145	total: 2.5s	remaining: 1.03s
708:	learn: 0.2654194	total: 2.5s	remaining: 1.03s
709:	learn: 0.2651461	total: 2.51s	remaining: 1.02s
710:	learn: 0.2647736	total: 2.51s	remaining: 1.02s
711:	learn: 0.2644030	total: 2.51s	remaining: 1.02s
712:	learn: 0.2641493	total: 2.52s	remaining: 1.01s
713:	learn: 0.2636034	total: 2.52s	remaining: 1.01s
714:	learn: 0.2633467	total: 2.52s	remaining: 1.01s
715:	learn: 0.2630884	total: 2.53s	remaining: 1s
716:	learn: 0.2627231	total: 2.53s	remaining: 999ms
717:	learn: 0.2623737	total: 2.53s	remaining: 995ms
718:	learn: 0.2620890	total: 2.54s	remaining: 992ms
719:	learn: 0.2619557	total: 2.54s	remaining: 988ms
720:	learn: 0.2615330	total: 2.54s	remaining: 985ms
721:	learn: 0.2613678	total: 2.55s	remaining: 982ms
722:	learn: 0.2609768	total: 2.55s	remaining: 978ms
723:	learn: 0.2607843	total: 2.56s	remaining: 974ms
724:	learn: 0.2604017	total: 2.56s	remaining: 971ms
725:	learn: 0.2601656	total: 2.56s	remaining: 968ms
726:	learn: 0.2598170	total: 2.57s	remaining: 964ms
727:	learn: 0.2596272	total: 2.57s	remaining: 960ms
728:	learn: 0.2594368	total: 2.57s	remaining: 957ms
729:	learn: 0.2591430	total: 2.58s	remaining: 953ms
730:	learn: 0.2588290	total: 2.58s	remaining: 950ms
731:	learn: 0.2585789	total: 2.58s	remaining: 946ms
732:	learn: 0.2584455	total: 2.59s	remaining: 943ms
733:	learn: 0.2584104	total: 2.59s	remaining: 939ms
734:	learn: 0.2582152	total: 2.59s	remaining: 935ms
735:	learn: 0.2581397	total: 2.6s	remaining: 932ms
736:	learn: 0.2578583	total: 2.6s	remaining: 929ms
737:	learn: 0.2576197	total: 2.6s	remaining: 925ms
738:	learn: 0.2573741	total: 2.61s	remaining: 922ms
739:	learn: 0.2571082	total: 2.61s	remaining: 918ms
740:	learn: 0.2568704	total: 2.62s	remaining: 915ms
741:	learn: 0.2566573	total: 2.62s	remaining: 911ms
742:	learn: 0.2564107	total: 2.62s	remaining: 907ms
743:	learn: 0.2561631	total: 2.63s	remaining: 904ms
744:	learn: 0.2561312	total: 2.63s	remaining: 901ms
745:	learn: 0.2557099	total: 2.63s	remaining: 897ms
746:	learn: 0.2554609	total: 2.64s	remaining: 894ms
747:	learn: 0.2552453	total: 2.64s	remaining: 890ms
748:	learn: 0.2549054	total: 2.65s	remaining: 887ms
749:	learn: 0.2546371	total: 2.65s	remaining: 884ms
750:	learn: 0.2543651	total: 2.65s	remaining: 880ms
751:	learn: 0.2537761	total: 2.66s	remaining: 876ms
752:	learn: 0.2535814	total: 2.66s	remaining: 873ms
753:	learn: 0.2531956	total: 2.67s	remaining: 870ms
754:	learn: 0.2531389	total: 2.67s	remaining: 866ms
755:	learn: 0.2529298	total: 2.67s	remaining: 862ms
756:	learn: 0.2527376	total: 2.68s	remaining: 859ms
757:	learn: 0.2524734	total: 2.68s	remaining: 856ms
758:	learn: 0.2521595	total: 2.68s	remaining: 852ms
759:	learn: 0.2519621	total: 2.69s	remaining: 849ms
760:	learn: 0.2516846	total: 2.69s	remaining: 846ms
761:	learn: 0.2512891	total: 2.7s	remaining: 842ms
762:	learn: 0.2510728	total: 2.7s	remaining: 839ms
763:	learn: 0.2507176	total: 2.7s	remaining: 835ms
764:	learn: 0.2503362	total: 2.71s	remaining: 832ms
765:	learn: 0.2502037	total: 2.71s	remaining: 829ms
766:	learn: 0.2498936	total: 2.71s	remaining: 825ms
767:	learn: 0.2496633	total: 2.72s	remaining: 821ms
768:	learn: 0.2493097	total: 2.72s	remaining: 818ms
769:	learn: 0.2489150	total: 2.73s	remaining: 815ms
770:	learn: 0.2486851	total: 2.73s	remaining: 811ms
771:	learn: 0.2483194	total: 2.73s	remaining: 807ms
772:	learn: 0.2479421	total: 2.74s	remaining: 804ms
773:	learn: 0.2478561	total: 2.74s	remaining: 801ms
774:	learn: 0.2475209	total: 2.75s	remaining: 797ms
775:	learn: 0.2471628	total: 2.75s	remaining: 794ms
776:	learn: 0.2469314	total: 2.75s	remaining: 790ms
777:	learn: 0.2466425	total: 2.76s	remaining: 787ms
778:	learn: 0.2465065	total: 2.76s	remaining: 783ms
779:	learn: 0.2461220	total: 2.76s	remaining: 780ms
780:	learn: 0.2458115	total: 2.77s	remaining: 776ms
781:	learn: 0.2456707	total: 2.77s	remaining: 773ms
782:	learn: 0.2454929	total: 2.77s	remaining: 769ms
783:	learn: 0.2451706	total: 2.78s	remaining: 766ms
784:	learn: 0.2448523	total: 2.78s	remaining: 762ms
785:	learn: 0.2445772	total: 2.79s	remaining: 759ms
786:	learn: 0.2443875	total: 2.79s	remaining: 755ms
787:	learn: 0.2441339	total: 2.79s	remaining: 751ms
788:	learn: 0.2438773	total: 2.8s	remaining: 748ms
789:	learn: 0.2438140	total: 2.8s	remaining: 744ms
790:	learn: 0.2434810	total: 2.8s	remaining: 741ms
791:	learn: 0.2429906	total: 2.81s	remaining: 737ms
792:	learn: 0.2426931	total: 2.81s	remaining: 734ms
793:	learn: 0.2424043	total: 2.81s	remaining: 730ms
794:	learn: 0.2421210	total: 2.82s	remaining: 726ms
795:	learn: 0.2419921	total: 2.82s	remaining: 723ms
796:	learn: 0.2418543	total: 2.82s	remaining: 719ms
797:	learn: 0.2418231	total: 2.83s	remaining: 716ms
798:	learn: 0.2415865	total: 2.83s	remaining: 712ms
799:	learn: 0.2412327	total: 2.83s	remaining: 709ms
800:	learn: 0.2410130	total: 2.84s	remaining: 705ms
801:	learn: 0.2406898	total: 2.84s	remaining: 702ms
802:	learn: 0.2403997	total: 2.85s	remaining: 698ms
803:	learn: 0.2403092	total: 2.85s	remaining: 694ms
804:	learn: 0.2399602	total: 2.85s	remaining: 691ms
805:	learn: 0.2397046	total: 2.85s	remaining: 687ms
806:	learn: 0.2395811	total: 2.86s	remaining: 684ms
807:	learn: 0.2393554	total: 2.86s	remaining: 680ms
808:	learn: 0.2389797	total: 2.87s	remaining: 677ms
809:	learn: 0.2386822	total: 2.87s	remaining: 673ms
810:	learn: 0.2385272	total: 2.87s	remaining: 669ms
811:	learn: 0.2383253	total: 2.88s	remaining: 666ms
812:	learn: 0.2382769	total: 2.88s	remaining: 662ms
813:	learn: 0.2380034	total: 2.88s	remaining: 659ms
814:	learn: 0.2378123	total: 2.88s	remaining: 655ms
815:	learn: 0.2376812	total: 2.89s	remaining: 651ms
816:	learn: 0.2375900	total: 2.89s	remaining: 648ms
817:	learn: 0.2373746	total: 2.9s	remaining: 644ms
818:	learn: 0.2368950	total: 2.9s	remaining: 641ms
819:	learn: 0.2365626	total: 2.9s	remaining: 637ms
820:	learn: 0.2364160	total: 2.91s	remaining: 634ms
821:	learn: 0.2362066	total: 2.91s	remaining: 630ms
822:	learn: 0.2357433	total: 2.91s	remaining: 627ms
823:	learn: 0.2355107	total: 2.92s	remaining: 623ms
824:	learn: 0.2353031	total: 2.92s	remaining: 620ms
825:	learn: 0.2349189	total: 2.92s	remaining: 616ms
826:	learn: 0.2345722	total: 2.93s	remaining: 612ms
827:	learn: 0.2342914	total: 2.93s	remaining: 609ms
828:	learn: 0.2340122	total: 2.93s	remaining: 605ms
829:	learn: 0.2337871	total: 2.94s	remaining: 602ms
830:	learn: 0.2337141	total: 2.94s	remaining: 598ms
831:	learn: 0.2334017	total: 2.94s	remaining: 595ms
832:	learn: 0.2331498	total: 2.95s	remaining: 591ms
833:	learn: 0.2329182	total: 2.95s	remaining: 588ms
834:	learn: 0.2326496	total: 2.96s	remaining: 584ms
835:	learn: 0.2323721	total: 2.96s	remaining: 580ms
836:	learn: 0.2321663	total: 2.96s	remaining: 577ms
837:	learn: 0.2317915	total: 2.96s	remaining: 573ms
838:	learn: 0.2317042	total: 2.97s	remaining: 570ms
839:	learn: 0.2315415	total: 2.97s	remaining: 566ms
840:	learn: 0.2312144	total: 2.98s	remaining: 563ms
841:	learn: 0.2309861	total: 2.98s	remaining: 559ms
842:	learn: 0.2308992	total: 2.98s	remaining: 556ms
843:	learn: 0.2307248	total: 2.99s	remaining: 552ms
844:	learn: 0.2304123	total: 2.99s	remaining: 548ms
845:	learn: 0.2302900	total: 2.99s	remaining: 545ms
846:	learn: 0.2299298	total: 3s	remaining: 541ms
847:	learn: 0.2297649	total: 3s	remaining: 538ms
848:	learn: 0.2296314	total: 3s	remaining: 534ms
849:	learn: 0.2292939	total: 3.01s	remaining: 531ms
850:	learn: 0.2291602	total: 3.01s	remaining: 527ms
851:	learn: 0.2288865	total: 3.01s	remaining: 523ms
852:	learn: 0.2286853	total: 3.02s	remaining: 520ms
853:	learn: 0.2285428	total: 3.02s	remaining: 516ms
854:	learn: 0.2284197	total: 3.02s	remaining: 513ms
855:	learn: 0.2282277	total: 3.03s	remaining: 509ms
856:	learn: 0.2278907	total: 3.03s	remaining: 506ms
857:	learn: 0.2277383	total: 3.03s	remaining: 502ms
858:	learn: 0.2275067	total: 3.04s	remaining: 499ms
859:	learn: 0.2272363	total: 3.04s	remaining: 495ms
860:	learn: 0.2269635	total: 3.04s	remaining: 492ms
861:	learn: 0.2265979	total: 3.05s	remaining: 488ms
862:	learn: 0.2264905	total: 3.05s	remaining: 484ms
863:	learn: 0.2262386	total: 3.06s	remaining: 481ms
864:	learn: 0.2260311	total: 3.06s	remaining: 477ms
865:	learn: 0.2258230	total: 3.06s	remaining: 474ms
866:	learn: 0.2255610	total: 3.06s	remaining: 470ms
867:	learn: 0.2254607	total: 3.07s	remaining: 467ms
868:	learn: 0.2250433	total: 3.07s	remaining: 463ms
869:	learn: 0.2249648	total: 3.08s	remaining: 460ms
870:	learn: 0.2247922	total: 3.08s	remaining: 456ms
871:	learn: 0.2244233	total: 3.08s	remaining: 452ms
872:	learn: 0.2242406	total: 3.08s	remaining: 449ms
873:	learn: 0.2240229	total: 3.09s	remaining: 445ms
874:	learn: 0.2238258	total: 3.09s	remaining: 442ms
875:	learn: 0.2237571	total: 3.1s	remaining: 438ms
876:	learn: 0.2235548	total: 3.1s	remaining: 435ms
877:	learn: 0.2234135	total: 3.1s	remaining: 431ms
878:	learn: 0.2230922	total: 3.11s	remaining: 428ms
879:	learn: 0.2226509	total: 3.11s	remaining: 424ms
880:	learn: 0.2224404	total: 3.11s	remaining: 421ms
881:	learn: 0.2223705	total: 3.12s	remaining: 417ms
882:	learn: 0.2223260	total: 3.12s	remaining: 414ms
883:	learn: 0.2220318	total: 3.13s	remaining: 410ms
884:	learn: 0.2218552	total: 3.13s	remaining: 407ms
885:	learn: 0.2217687	total: 3.13s	remaining: 403ms
886:	learn: 0.2214940	total: 3.13s	remaining: 399ms
887:	learn: 0.2213258	total: 3.14s	remaining: 396ms
888:	learn: 0.2212884	total: 3.14s	remaining: 392ms
889:	learn: 0.2210636	total: 3.15s	remaining: 389ms
890:	learn: 0.2208620	total: 3.15s	remaining: 385ms
891:	learn: 0.2207268	total: 3.15s	remaining: 382ms
892:	learn: 0.2205061	total: 3.16s	remaining: 378ms
893:	learn: 0.2203278	total: 3.16s	remaining: 375ms
894:	learn: 0.2200449	total: 3.16s	remaining: 371ms
895:	learn: 0.2197147	total: 3.17s	remaining: 368ms
896:	learn: 0.2195113	total: 3.17s	remaining: 364ms
897:	learn: 0.2192671	total: 3.17s	remaining: 360ms
898:	learn: 0.2192258	total: 3.18s	remaining: 357ms
899:	learn: 0.2189515	total: 3.18s	remaining: 353ms
900:	learn: 0.2186505	total: 3.18s	remaining: 350ms
901:	learn: 0.2184193	total: 3.19s	remaining: 346ms
902:	learn: 0.2183672	total: 3.19s	remaining: 343ms
903:	learn: 0.2181273	total: 3.19s	remaining: 339ms
904:	learn: 0.2178674	total: 3.2s	remaining: 336ms
905:	learn: 0.2176005	total: 3.2s	remaining: 332ms
906:	learn: 0.2174875	total: 3.2s	remaining: 329ms
907:	learn: 0.2171777	total: 3.21s	remaining: 325ms
908:	learn: 0.2169544	total: 3.21s	remaining: 322ms
909:	learn: 0.2167341	total: 3.22s	remaining: 318ms
910:	learn: 0.2166133	total: 3.22s	remaining: 315ms
911:	learn: 0.2164535	total: 3.22s	remaining: 311ms
912:	learn: 0.2164128	total: 3.23s	remaining: 307ms
913:	learn: 0.2162686	total: 3.23s	remaining: 304ms
914:	learn: 0.2160413	total: 3.23s	remaining: 300ms
915:	learn: 0.2158222	total: 3.24s	remaining: 297ms
916:	learn: 0.2157209	total: 3.24s	remaining: 293ms
917:	learn: 0.2155155	total: 3.24s	remaining: 290ms
918:	learn: 0.2154142	total: 3.25s	remaining: 286ms
919:	learn: 0.2151079	total: 3.25s	remaining: 283ms
920:	learn: 0.2149705	total: 3.25s	remaining: 279ms
921:	learn: 0.2149322	total: 3.26s	remaining: 276ms
922:	learn: 0.2148807	total: 3.26s	remaining: 272ms
923:	learn: 0.2146427	total: 3.27s	remaining: 269ms
924:	learn: 0.2145465	total: 3.27s	remaining: 265ms
925:	learn: 0.2143896	total: 3.27s	remaining: 261ms
926:	learn: 0.2141042	total: 3.27s	remaining: 258ms
927:	learn: 0.2140653	total: 3.28s	remaining: 254ms
928:	learn: 0.2138073	total: 3.28s	remaining: 251ms
929:	learn: 0.2135790	total: 3.29s	remaining: 247ms
930:	learn: 0.2135487	total: 3.29s	remaining: 244ms
931:	learn: 0.2134147	total: 3.29s	remaining: 240ms
932:	learn: 0.2132543	total: 3.3s	remaining: 237ms
933:	learn: 0.2131319	total: 3.3s	remaining: 233ms
934:	learn: 0.2128832	total: 3.3s	remaining: 230ms
935:	learn: 0.2125868	total: 3.31s	remaining: 226ms
936:	learn: 0.2124035	total: 3.31s	remaining: 223ms
937:	learn: 0.2121759	total: 3.31s	remaining: 219ms
938:	learn: 0.2120445	total: 3.32s	remaining: 216ms
939:	learn: 0.2118077	total: 3.32s	remaining: 212ms
940:	learn: 0.2114668	total: 3.33s	remaining: 208ms
941:	learn: 0.2111545	total: 3.33s	remaining: 205ms
942:	learn: 0.2111246	total: 3.33s	remaining: 201ms
943:	learn: 0.2109490	total: 3.33s	remaining: 198ms
944:	learn: 0.2107927	total: 3.34s	remaining: 194ms
945:	learn: 0.2105585	total: 3.34s	remaining: 191ms
946:	learn: 0.2105239	total: 3.35s	remaining: 187ms
947:	learn: 0.2104112	total: 3.35s	remaining: 184ms
948:	learn: 0.2101941	total: 3.35s	remaining: 180ms
949:	learn: 0.2101563	total: 3.36s	remaining: 177ms
950:	learn: 0.2101322	total: 3.36s	remaining: 173ms
951:	learn: 0.2098230	total: 3.36s	remaining: 170ms
952:	learn: 0.2097301	total: 3.37s	remaining: 166ms
953:	learn: 0.2095164	total: 3.37s	remaining: 163ms
954:	learn: 0.2093993	total: 3.37s	remaining: 159ms
955:	learn: 0.2091224	total: 3.38s	remaining: 155ms
956:	learn: 0.2089994	total: 3.38s	remaining: 152ms
957:	learn: 0.2088083	total: 3.38s	remaining: 148ms
958:	learn: 0.2087013	total: 3.39s	remaining: 145ms
959:	learn: 0.2084003	total: 3.39s	remaining: 141ms
960:	learn: 0.2082851	total: 3.4s	remaining: 138ms
961:	learn: 0.2080820	total: 3.4s	remaining: 134ms
962:	learn: 0.2078811	total: 3.4s	remaining: 131ms
963:	learn: 0.2076056	total: 3.4s	remaining: 127ms
964:	learn: 0.2074108	total: 3.41s	remaining: 124ms
965:	learn: 0.2070406	total: 3.41s	remaining: 120ms
966:	learn: 0.2067699	total: 3.42s	remaining: 117ms
967:	learn: 0.2063896	total: 3.42s	remaining: 113ms
968:	learn: 0.2060907	total: 3.42s	remaining: 110ms
969:	learn: 0.2059021	total: 3.43s	remaining: 106ms
970:	learn: 0.2056345	total: 3.43s	remaining: 102ms
971:	learn: 0.2053408	total: 3.43s	remaining: 98.9ms
972:	learn: 0.2050676	total: 3.44s	remaining: 95.4ms
973:	learn: 0.2048304	total: 3.44s	remaining: 91.9ms
974:	learn: 0.2046749	total: 3.44s	remaining: 88.3ms
975:	learn: 0.2044718	total: 3.45s	remaining: 84.8ms
976:	learn: 0.2041876	total: 3.45s	remaining: 81.3ms
977:	learn: 0.2039090	total: 3.46s	remaining: 77.7ms
978:	learn: 0.2037805	total: 3.46s	remaining: 74.2ms
979:	learn: 0.2034705	total: 3.46s	remaining: 70.7ms
980:	learn: 0.2032025	total: 3.47s	remaining: 67.1ms
981:	learn: 0.2030209	total: 3.47s	remaining: 63.6ms
982:	learn: 0.2027762	total: 3.47s	remaining: 60.1ms
983:	learn: 0.2026031	total: 3.48s	remaining: 56.5ms
984:	learn: 0.2023929	total: 3.48s	remaining: 53ms
985:	learn: 0.2021011	total: 3.48s	remaining: 49.5ms
986:	learn: 0.2018960	total: 3.49s	remaining: 45.9ms
987:	learn: 0.2017884	total: 3.49s	remaining: 42.4ms
988:	learn: 0.2014352	total: 3.49s	remaining: 38.9ms
989:	learn: 0.2012731	total: 3.5s	remaining: 35.3ms
990:	learn: 0.2010667	total: 3.5s	remaining: 31.8ms
991:	learn: 0.2008429	total: 3.5s	remaining: 28.3ms
992:	learn: 0.2006669	total: 3.51s	remaining: 24.7ms
993:	learn: 0.2005137	total: 3.51s	remaining: 21.2ms
994:	learn: 0.2001325	total: 3.52s	remaining: 17.7ms
995:	learn: 0.1999855	total: 3.52s	remaining: 14.1ms
996:	learn: 0.1996729	total: 3.52s	remaining: 10.6ms
997:	learn: 0.1994904	total: 3.52s	remaining: 7.06ms
998:	learn: 0.1993044	total: 3.53s	remaining: 3.53ms
999:	learn: 0.1989547	total: 3.53s	remaining: 0us
In [28]:
df_compare
Out[28]:
Model Mean Squared Error Mean Absolute Error R2 Score Training Time (s)
0 Ridge Regression 0.746614 0.598242 0.98175 0.002362
1 Linear Regression 0.77558 0.613905 0.981042 0.008197
2 Lasso Regression 1.066347 0.782536 0.973934 0.089346
3 Elastic Net 1.070319 0.718474 0.973837 0.100495
4 Gradient Boosting 87.40121 7.495735 -1.136431 2.311828
5 AdaBoost 89.525725 7.622743 -1.188363 0.722531
6 Decision Tree 91.434513 7.735897 -1.235021 0.077579
7 Random Forest 92.824516 7.815107 -1.268998 4.073694
11 CatBoost 93.180853 7.853729 -1.277709 3.699676
8 XGBoost 94.682514 7.893578 -1.314415 0.462786
9 K-Neighbors Regressor 330.959365 16.750109 -7.089956 0.000574
10 SVR 349.552891 17.672316 -7.544455 0.142434

We trained a variety of regression models to predict stock prices, including Linear Regression, Ridge Regression, Lasso Regression, Elastic Net, Support Vector Regression (SVR), K-Neighbors Regressor, Decision Tree, Random Forest, Gradient Boosting, AdaBoost, XGBoost, and CatBoost. The training results show a variety of metrics for different regression models, which are useful in evaluating their performance. Let's break down what each metric means and its significance:

  • Mean Squared Error (MSE): MSE is a measure of the average squared difference between the actual and predicted values. The lower the MSE, the better the model's performance. A high MSE, as seen in models like SVR, Decision Tree, and Random Forest, indicates poor model performance.
  • Mean Absolute Error (MAE): MAE measures the average magnitude of errors between predicted and actual values, without considering their direction (i.e., no squaring or rooting). Similar to MSE, a lower MAE is better. High MAE values in models like SVR suggest significant average errors in predictions.
  • R2 Score: The R2 Score, or the coefficient of determination, measures how well the regression predictions approximate the real data points. An R2 Score of 1 indicates perfect prediction. Positive values close to 1 indicate good model performance. The Linear and Ridge Regression models have high R2 scores, implying they fit the data well. Negative R2 scores, as seen in SVR, K-Neighbors Regressor, Decision Tree, Random Forest, Gradient Boosting, and AdaBoost, indicate that these models perform worse than a simple model that would always predict the mean value of the target variable.
  • Training Time (s): This measures how long it takes for each model to be trained. Shorter training times are generally preferred, especially when dealing with large datasets or in scenarios where model training needs to be performed frequently. The Random Forest model, for example, has a significantly longer training time compared to others, which might be a drawback in time-sensitive applications.
  • Analysis of Results: Linear Regression and Ridge Regression show the best performance in terms of MSE, MAE, and R2 Score. They are also efficient with relatively low training times. Lasso Regression performs moderately well, with a reasonable R2 score but much higher MSE and MAE than Linear and Ridge Regression. Models like SVR, K-Neighbors Regressor, Decision Tree, Random Forest, Gradient Boosting, and AdaBoost have very poor R2 scores, indicating that they are not suitable for this particular dataset or require parameter tuning. The Random Forest model, despite its popularity for handling complex datasets, shows poor performance and extremely high training time in this case, which might be due to overfitting, the need for hyperparameter tuning, or the nature of the dataset.
  • Conclusion: The choice of the best model depends on a balance of these metrics and the specific requirements of the application (e.g., prediction accuracy vs. training time). For this dataset, Linear Regression and Ridge Regression seem to be the most effective. However, for practical deployment, one should also consider factors like model complexity, interpretability, and how the model will generalize to unseen data.

3. Linear Regression Model¶

3.a Linear Regression Model with All Features¶

In [29]:
# Train the linear regression model
lr_model_base = LinearRegression()
lr_model_base.fit(X_train_scaled, y_train)

# Make predictions on the scaled test set
lr_pred_base = lr_model_base.predict(X_test_scaled)
In [30]:
prediction_df = pd.DataFrame()
prediction_df['date'] = df[df.date.dt.year>=2020]['date']
prediction_df['y_test'] = y_test
prediction_df['lr_pred_base'] = lr_pred_base

prediction_df.head()
Out[30]:
date y_test lr_pred_base
1729 2020-01-02 54.240002 54.157799
1730 2020-01-03 54.150002 54.553547
1731 2020-01-06 53.919998 54.336899
1732 2020-01-07 54.049999 53.907121
1733 2020-01-08 54.189999 54.192608
In [31]:
lr_score_base = evaluate_regression_model(y_test, lr_pred_base)
Mean Squared Error (MSE): 0.776
Root Mean Squared Error (RMSE): 0.881
Mean Absolute Error (MAE): 0.614
R-squared (R2): 0.981
In [32]:
lr_score_base
Out[32]:
{'MSE': 0.7755799343709945,
 'RMSE': 0.880670162076015,
 'MAE': 0.6139047642970673,
 'R2': 0.9810417591179742}
In [33]:
plot_regression_accuracy(y_test, lr_pred_base)
In [34]:
plot_predictions(df,lr_pred_base)
In [35]:
lr_base_feature_importance = plot_feature_importance(lr_model_base,X_train,20)
In [36]:
lr_base_feature_importance[:15]
Out[36]:
Feature Importance
0 adj close_10d_avg 59.927343
1 close_10d_avg 52.842967
2 close_15d_avg 32.217386
3 ema_9 31.326586
4 adj close_15d_avg 29.982687
5 close_5d_avg 17.929065
6 sma_5 13.495880
7 adj close_5d_avg 12.461717
8 adj close_3d_avg 10.395039
9 close_1d_ago 9.861051
10 adj close_7d_ago 9.733592
11 adj close_1d_ago 9.044414
12 close_7d_ago 7.605479
13 sma_15 6.911237
14 adj close_14d_ago 5.986732

3.b. Linear Regression Model with top 20 Features¶

In [37]:
keep_cols20 = lr_base_feature_importance[:20]['Feature'].tolist()

X_train20 = X_train[keep_cols20]
X_test20 = X_test[keep_cols20]

scaler = StandardScaler()
X_train_scaled20 = scaler.fit_transform(X_train20)
X_test_scaled20 = scaler.transform(X_test20)
In [38]:
# Train the linear regression model
lr_model20 = LinearRegression()
lr_model20.fit(X_train_scaled20, y_train)

# Make predictions on the scaled test set
lr_pred20 = lr_model20.predict(X_test_scaled20)
lr_score20 = evaluate_regression_model(y_test, lr_pred20)
Mean Squared Error (MSE): 0.768
Root Mean Squared Error (RMSE): 0.877
Mean Absolute Error (MAE): 0.613
R-squared (R2): 0.981
In [39]:
prediction_df['lr_pred20'] = lr_pred20

prediction_df.head()
Out[39]:
date y_test lr_pred_base lr_pred20
1729 2020-01-02 54.240002 54.157799 54.239022
1730 2020-01-03 54.150002 54.553547 54.520826
1731 2020-01-06 53.919998 54.336899 54.065422
1732 2020-01-07 54.049999 53.907121 54.067429
1733 2020-01-08 54.189999 54.192608 53.987340
In [40]:
lr_score20
Out[40]:
{'MSE': 0.7682885573217368,
 'RMSE': 0.8765207112908039,
 'MAE': 0.6133462584657988,
 'R2': 0.9812199892092073}
In [41]:
plot_feature_importance(lr_model20,X_train20,20)
Out[41]:
Feature Importance
0 adj close_10d_avg 39.701594
1 close_10d_avg 34.492844
2 ema_9 16.380613
3 adj close_7d_avg 14.543602
4 sma_5 11.032622
5 close_15d_avg 10.687774
6 sma_15 9.954755
7 adj close_5d_avg 8.830203
8 adj close_7d_ago 8.207036
9 adj close_15d_avg 7.303476
10 close_7d_ago 7.068989
11 adj close_1d_ago 6.587839
12 close_5d_avg 6.427914
13 close_1d_ago 4.837201
14 adj close_3d_avg 4.708264
15 close_3d_ago 4.101885
16 adj close_3d_ago 3.394063
17 low_5d_avg 1.134743
18 low_10d_avg 0.507845
19 adj close_14d_ago 0.220884

3.c Linear Regression Model with top 15 Features¶

In [42]:
keep_cols15 = lr_base_feature_importance[:15]['Feature'].tolist()

X_train15 = X_train[keep_cols15]
X_test15 = X_test[keep_cols15]

scaler = StandardScaler()
X_train_scaled15 = scaler.fit_transform(X_train15)
X_test_scaled15 = scaler.transform(X_test15)
In [43]:
# Train the linear regression model
lr_model15 = LinearRegression()
lr_model15.fit(X_train_scaled15, y_train)

# Make predictions on the scaled test set
lr_pred15 = lr_model15.predict(X_test_scaled15)
lr_score15 = evaluate_regression_model(y_test, lr_pred15)
Mean Squared Error (MSE): 0.797
Root Mean Squared Error (RMSE): 0.893
Mean Absolute Error (MAE): 0.626
R-squared (R2): 0.981
In [44]:
prediction_df['lr_pred15'] = lr_pred15

prediction_df.head()
Out[44]:
date y_test lr_pred_base lr_pred20 lr_pred15
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897
In [45]:
lr_score15
Out[45]:
{'MSE': 0.7968925999367115,
 'RMSE': 0.8926884114497686,
 'MAE': 0.6261544545364333,
 'R2': 0.9805207932836008}
In [46]:
plot_feature_importance(lr_model15,X_train15,15)
Out[46]:
Feature Importance
0 close_10d_avg 29.471317
1 adj close_10d_avg 24.455782
2 ema_9 21.480216
3 adj close_5d_avg 16.328249
4 adj close_1d_ago 10.320838
5 adj close_3d_avg 9.162211
6 close_1d_ago 7.864891
7 adj close_7d_ago 7.162200
8 sma_15 6.734176
9 close_7d_ago 6.356886
10 close_5d_avg 4.707638
11 close_15d_avg 3.343602
12 sma_5 2.379653
13 adj close_15d_avg 2.216233
14 adj close_14d_ago 0.415949

3.d. Linear Regression Model with top 10 Features¶

In [47]:
keep_cols10 = lr_base_feature_importance[:10]['Feature'].tolist()

X_train10 = X_train[keep_cols10]
X_test10 = X_test[keep_cols10]

scaler = StandardScaler()
X_train_scaled10 = scaler.fit_transform(X_train10)
X_test_scaled10 = scaler.transform(X_test10)
In [48]:
# Train the linear regression model
lr_model10 = LinearRegression()
lr_model10.fit(X_train_scaled10, y_train)

# Make predictions on the scaled test set
lr_pred10 = lr_model10.predict(X_test_scaled10)
lr_score10 = evaluate_regression_model(y_test, lr_pred10)
Mean Squared Error (MSE): 0.786
Root Mean Squared Error (RMSE): 0.887
Mean Absolute Error (MAE): 0.626
R-squared (R2): 0.981
In [49]:
prediction_df['lr_pred10'] = lr_pred10

prediction_df.head()
Out[49]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508
In [50]:
lr_score10
Out[50]:
{'MSE': 0.7864993593280865,
 'RMSE': 0.8868479911056271,
 'MAE': 0.6260216655405225,
 'R2': 0.9807748451875646}
In [51]:
plot_feature_importance(lr_model10,X_train10,10)
Out[51]:
Feature Importance
0 ema_9 16.842727
1 close_5d_avg 14.158982
2 adj close_3d_avg 8.615665
3 adj close_5d_avg 6.493537
4 close_15d_avg 6.467632
5 close_10d_avg 6.069412
6 sma_5 3.130954
7 adj close_10d_avg 1.666197
8 close_1d_ago 0.905094
9 adj close_15d_avg 0.029654

4. Ridge Regression Parameter Fine Tuning¶

4.a. Ridge Regression with All Features¶

In [52]:
ridge_model = Ridge()

    # Define the hyperparameter grid to search
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

    # Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

    # Get the best model
best_ridge_model = grid_search.best_estimator_

    # Make predictions on the test set
ridge_pred_base = best_ridge_model.predict(X_test_scaled)

    # Evaluate the best model
mse = mean_squared_error(y_test, ridge_pred_base)
rmse = mean_squared_error(y_test, ridge_pred_base, squared=False)
mae = mean_absolute_error(y_test, ridge_pred_base)
r2 = r2_score(y_test, ridge_pred_base)

print("Best Ridge Regression Model:")
print(f"Best alpha: {best_ridge_model.alpha}")
print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
print(f"Mean Squared Error: {np.round(mse,3)}")
print(f"Mean Absolute Error: {np.round(mae,3)}")
print(f"R2 Score: {np.round(r2,3)}")

ridge_score = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }
Best Ridge Regression Model:
Best alpha: 0.001
Root Mean Squared Error (RMSE): 0.871
Mean Squared Error: 0.759
Mean Absolute Error: 0.606
R2 Score: 0.981
In [53]:
ridge_base_feature_importance = plot_feature_importance(best_ridge_model,X_train,20)
In [54]:
ridge_base_feature_importance[:20]
Out[54]:
Feature Importance
0 close_5d_avg 19.627386
1 ema_9 18.897025
2 sma_5 18.403314
3 adj close_5d_avg 11.844801
4 close_10d_avg 9.872194
5 adj close_10d_avg 9.389002
6 close_15d_avg 7.887685
7 close_1d_ago 7.038813
8 close_7d_avg 6.958126
9 adj close_1d_ago 6.600528
10 close_5d_ago 6.008206
11 sma_15 5.538994
12 low_5d_avg 4.965283
13 low_10d_avg 4.840114
14 open_5d_avg 4.446916
15 high_5d_avg 3.714418
16 open_10d_avg 3.486739
17 high_30d_avg 2.896473
18 close_3d_ago 2.893431
19 open_15d_avg 2.892245
In [55]:
prediction_df['ridge_pred_base'] = ridge_pred_base

prediction_df.head()
Out[55]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624

4.b. Ridge Regression with top 20 Features¶

In [56]:
keep_cols20 = ridge_base_feature_importance[:20]['Feature'].tolist()

X_train20 = X_train[keep_cols20]
X_test20 = X_test[keep_cols20]

scaler = StandardScaler()
X_train_scaled20 = scaler.fit_transform(X_train20)
X_test_scaled20 = scaler.transform(X_test20)

# Train model
ridge_model20 = Ridge(alpha=0.001)
ridge_model20.fit(X_train_scaled20, y_train)


# Make predictions on the scaled test set
ridge_pred20 = ridge_model20.predict(X_test_scaled20)
ridge_score20 = evaluate_regression_model(y_test, ridge_pred20)
Mean Squared Error (MSE): 0.73
Root Mean Squared Error (RMSE): 0.854
Mean Absolute Error (MAE): 0.586
R-squared (R2): 0.982
In [57]:
plot_feature_importance(ridge_model20,X_train20,20)
Out[57]:
Feature Importance
0 sma_5 25.441510
1 close_5d_avg 22.435160
2 adj close_5d_avg 11.490239
3 adj close_1d_ago 5.671591
4 adj close_10d_avg 5.507894
5 close_5d_ago 5.147037
6 close_1d_ago 4.554521
7 close_10d_avg 4.509604
8 close_15d_avg 3.719596
9 low_5d_avg 3.022988
10 sma_15 2.915010
11 ema_9 2.899160
12 low_10d_avg 1.985008
13 high_5d_avg 1.745675
14 open_10d_avg 1.590243
15 open_5d_avg 1.561036
16 open_15d_avg 1.264441
17 close_3d_ago 0.798379
18 close_7d_avg 0.640924
19 high_30d_avg 0.200250
In [58]:
prediction_df['ridge_pred20'] = ridge_pred20

prediction_df.head()
Out[58]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649

5. Lasso Regression Model Parameter Fine Tuning¶

5.a Lasso Regression with All Features¶

In [59]:
lasso_model = Lasso()

param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

    # Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

    # Get the best model
best_lasso_model = grid_search.best_estimator_

    # Make predictions on the test set
lasso_pred_base = best_lasso_model.predict(X_test_scaled)

    # Evaluate the best model
mse = mean_squared_error(y_test, lasso_pred_base)
rmse = mean_squared_error(y_test, lasso_pred_base, squared=False)
mae = mean_absolute_error(y_test, lasso_pred_base)
r2 = r2_score(y_test, lasso_pred_base)

print("Best Lasso Regression Model:")
print(f"Best alpha: {best_lasso_model.alpha}")
print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
print(f"Mean Squared Error: {np.round(mse,3)}")
print(f"Mean Absolute Error: {np.round(mae,3)}")
print(f"R2 Score: {np.round(r2,3)}")

lasso_score = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }
Best Lasso Regression Model:
Best alpha: 0.001
Root Mean Squared Error (RMSE): 0.97
Mean Squared Error: 0.94
Mean Absolute Error: 0.663
R2 Score: 0.977
In [60]:
lasso_base_feature_importance = plot_feature_importance(best_lasso_model,X_train,20)
In [61]:
lasso_base_feature_importance[:20]
Out[61]:
Feature Importance
0 ema_9 4.177865
1 close_3d_avg 1.314731
2 macd 1.312350
3 macd_signal 1.160117
4 sma_15 1.006266
5 adj close_3d_avg 0.822478
6 low_1d_ago 0.722646
7 close_3d_ago 0.583764
8 rsi 0.543082
9 open_3d_ago 0.510853
10 sma_30 0.492814
11 high_1d_ago 0.466770
12 adj close_3d_ago 0.422750
13 low_15d_avg 0.404237
14 open_1d_ago 0.365671
15 high_14d_ago 0.307277
16 low_30d_avg 0.291303
17 sma_10 0.255037
18 open_30d_avg 0.241245
19 high_30d_avg 0.227913
In [62]:
prediction_df['lasso_pred_base'] = lasso_pred_base

prediction_df.head()
Out[62]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350

5.b Lasso Regression with Top 20 Features¶

In [63]:
keep_cols20 = lasso_base_feature_importance[:20]['Feature'].tolist()

X_train20 = X_train[keep_cols20]
X_test20 = X_test[keep_cols20]

scaler = StandardScaler()
X_train_scaled20 = scaler.fit_transform(X_train20)
X_test_scaled20 = scaler.transform(X_test20)

# Train model
lasso_model20 = Lasso(alpha=0.001)
lasso_model20.fit(X_train_scaled20, y_train)


# Make predictions on the scaled test set
lasso_pred20 = lasso_model20.predict(X_test_scaled20)
lasso_score20 = evaluate_regression_model(y_test, lasso_pred20)
Mean Squared Error (MSE): 0.948
Root Mean Squared Error (RMSE): 0.974
Mean Absolute Error (MAE): 0.665
R-squared (R2): 0.977
In [64]:
plot_feature_importance(lasso_model20,X_train20,20)
Out[64]:
Feature Importance
0 ema_9 4.017287
1 close_3d_avg 2.790781
2 low_1d_ago 1.047406
3 close_3d_ago 1.034968
4 sma_30 1.029511
5 low_15d_avg 0.997798
6 macd 0.968169
7 macd_signal 0.880009
8 high_14d_ago 0.568316
9 high_1d_ago 0.533765
10 rsi 0.496376
11 sma_10 0.423288
12 adj close_3d_avg 0.228550
13 open_3d_ago 0.204478
14 open_1d_ago 0.139447
15 low_30d_avg 0.025033
16 sma_15 0.000000
17 adj close_3d_ago 0.000000
18 open_30d_avg 0.000000
19 high_30d_avg 0.000000
In [65]:
prediction_df['lasso_pred20'] = lasso_pred20

prediction_df.head()
Out[65]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base lasso_pred20
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024 54.501734
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970 54.262946
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755 54.001497
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707 53.859163
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350 53.989285

6. Elastic Net Regression Model Parameter Fine Tuning¶

6.a. Elastic Net with All Features¶

In [66]:
elastic_net_model = ElasticNet()

    # Define the hyperparameter grid to search
param_grid = {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
    }

    # Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=elastic_net_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

    # Get the best model
best_elastic_net_model = grid_search.best_estimator_

    # Make predictions on the test set
elastic_pred_base = best_elastic_net_model.predict(X_test_scaled)

    # Evaluate the best model
mse = mean_squared_error(y_test, elastic_pred_base)
rmse = mean_squared_error(y_test, elastic_pred_base, squared=False)
mae = mean_absolute_error(y_test, elastic_pred_base)
r2 = r2_score(y_test, elastic_pred_base)

print("Best Elastic Net Model:")
print(f"Best alpha: {best_elastic_net_model.alpha}")
print(f"Best l1_ratio: {best_elastic_net_model.l1_ratio}")
print(f'Root Mean Squared Error (RMSE): {np.round(rmse,3)}')
print(f"Mean Squared Error: {np.round(mse,3)}")
print(f"Mean Absolute Error: {np.round(mae,3)}")
print(f"R2 Score: {np.round(r2,3)}")

elastic_score = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }
Best Elastic Net Model:
Best alpha: 0.001
Best l1_ratio: 0.1
Root Mean Squared Error (RMSE): 0.953
Mean Squared Error: 0.908
Mean Absolute Error: 0.653
R2 Score: 0.978
In [67]:
elastic_base_feature_importance = plot_feature_importance(best_elastic_net_model,X_train,20)
In [68]:
elastic_base_feature_importance[:20]
Out[68]:
Feature Importance
0 ema_9 1.613307
1 sma_5 1.518359
2 macd 1.493389
3 close_3d_avg 1.290926
4 sma_10 1.289912
5 macd_signal 1.253588
6 adj close_3d_avg 1.239816
7 low_1d_ago 0.982940
8 sma_15 0.923173
9 high_1d_ago 0.815800
10 close_3d_ago 0.765736
11 open_1d_ago 0.631358
12 sma_30 0.586684
13 low_3d_avg 0.580421
14 adj close_3d_ago 0.544949
15 open_3d_ago 0.531747
16 rsi 0.485403
17 high_3d_avg 0.431768
18 close_1d_ago 0.425664
19 low_15d_avg 0.399066
In [69]:
prediction_df['elastic_pred_base'] = elastic_pred_base

prediction_df.head()
Out[69]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base lasso_pred20 elastic_pred_base
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024 54.501734 54.444972
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970 54.262946 54.198628
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755 54.001497 54.011290
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707 53.859163 53.818335
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350 53.989285 54.037049

6.b. Elastic Net with Top 20 Features¶

In [70]:
keep_cols20 = elastic_base_feature_importance[:20]['Feature'].tolist()

X_train20 = X_train[keep_cols20]
X_test20 = X_test[keep_cols20]

scaler = StandardScaler()
X_train_scaled20 = scaler.fit_transform(X_train20)
X_test_scaled20 = scaler.transform(X_test20)

# Train model
elastic_model20 = ElasticNet(alpha=0.001,l1_ratio =  0.9)
elastic_model20.fit(X_train_scaled20, y_train)


# Make predictions on the scaled test set
elastic_pred20 = elastic_model20.predict(X_test_scaled20)
elastic_score20 = evaluate_regression_model(y_test, elastic_pred20)
Mean Squared Error (MSE): 0.948
Root Mean Squared Error (RMSE): 0.974
Mean Absolute Error (MAE): 0.667
R-squared (R2): 0.977
In [71]:
plot_feature_importance(elastic_model20,X_train20,20)
Out[71]:
Feature Importance
0 ema_9 3.518012
1 close_3d_avg 2.975563
2 low_15d_avg 1.164547
3 sma_30 1.008792
4 low_1d_ago 0.912273
5 high_1d_ago 0.840268
6 sma_10 0.826143
7 adj close_3d_avg 0.783827
8 macd_signal 0.701467
9 macd 0.690383
10 sma_15 0.565839
11 adj close_3d_ago 0.556647
12 rsi 0.506954
13 close_3d_ago 0.408292
14 open_3d_ago 0.238611
15 high_3d_avg 0.039417
16 low_3d_avg 0.000000
17 sma_5 0.000000
18 close_1d_ago 0.000000
19 open_1d_ago 0.000000
In [72]:
prediction_df['elastic_pred20'] = elastic_pred20

prediction_df.head()
Out[72]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base lasso_pred20 elastic_pred_base elastic_pred20
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024 54.501734 54.444972 54.503795
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970 54.262946 54.198628 54.263269
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755 54.001497 54.011290 54.055709
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707 53.859163 53.818335 53.920259
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350 53.989285 54.037049 54.058664

7. Model Comparison¶

In [73]:
ela_df = pd.DataFrame([elastic_score.keys(),elastic_score.values()])
ela_df.columns = ela_df.iloc[0]
ela_df = ela_df[1:].reset_index(drop=True)
ela_df['Model'] = 'Elastic_Net with All Features'

ela_20_df = pd.DataFrame([elastic_score20.keys(),elastic_score20.values()])
ela_20_df.columns = ela_20_df.iloc[0]
ela_20_df = ela_20_df[1:].reset_index(drop=True)
ela_20_df['Model'] = 'Elastic_Net with Top 20 Features'

lasso_df = pd.DataFrame([lasso_score.keys(),lasso_score.values()])
lasso_df.columns = lasso_df.iloc[0]
lasso_df = lasso_df[1:].reset_index(drop=True)
lasso_df['Model'] = 'Lasso with All Features'

lasso_20_df = pd.DataFrame([lasso_score20.keys(),lasso_score20.values()])
lasso_20_df.columns = lasso_20_df.iloc[0]
lasso_20_df = lasso_20_df[1:].reset_index(drop=True)
lasso_20_df['Model'] = 'Lasso with Top 20 Features'

ridge_df = pd.DataFrame([ridge_score.keys(),ridge_score.values()])
ridge_df.columns = ridge_df.iloc[0]
ridge_df = ridge_df[1:].reset_index(drop=True)
ridge_df['Model'] = 'Ridge with All Features'

ridge_20_df = pd.DataFrame([ridge_score20.keys(),ridge_score20.values()])
ridge_20_df.columns = ridge_20_df.iloc[0]
ridge_20_df = ridge_20_df[1:].reset_index(drop=True)
ridge_20_df['Model'] = 'Ridge with Top 20 Features'

lr_base_df = pd.DataFrame([lr_score_base.keys(),lr_score_base.values()])
lr_base_df.columns = lr_base_df.iloc[0]
lr_base_df = lr_base_df[1:].reset_index(drop=True)
lr_base_df['Model'] = 'Linear Reg. with All Features'

lr_20_df = pd.DataFrame([lr_score20.keys(),lr_score20.values()])
lr_20_df.columns = lr_20_df.iloc[0]
lr_20_df = lr_20_df[1:].reset_index(drop=True)
lr_20_df['Model'] = 'Linear Reg. with Top 20 Features'

lr_15_df = pd.DataFrame([lr_score15.keys(),lr_score15.values()])
lr_15_df.columns = lr_15_df.iloc[0]
lr_15_df = lr_15_df[1:].reset_index(drop=True)
lr_15_df['Model'] = 'Linear Reg. with Top 15 Features'

lr_10_df = pd.DataFrame([lr_score10.keys(),lr_score10.values()])
lr_10_df.columns = lr_10_df.iloc[0]
lr_10_df = lr_10_df[1:].reset_index(drop=True)
lr_10_df['Model'] = 'Linear Reg. with Top 10 Features'

df_compare = pd.concat([ela_df,lasso_df,ridge_df,ela_20_df,lasso_20_df,ridge_20_df,
                        lr_base_df,lr_20_df,lr_15_df,lr_10_df]).sort_values(by=['R2'],ascending=False).reset_index(drop=True)

df_compare
Out[73]:
MSE RMSE MAE R2 Model
0 0.729511 0.854114 0.585601 0.982168 Ridge with Top 20 Features
1 0.758611 0.870983 0.605987 0.981457 Ridge with All Features
2 0.768289 0.876521 0.613346 0.98122 Linear Reg. with Top 20 Features
3 0.77558 0.88067 0.613905 0.981042 Linear Reg. with All Features
4 0.786499 0.886848 0.626022 0.980775 Linear Reg. with Top 10 Features
5 0.796893 0.892688 0.626154 0.980521 Linear Reg. with Top 15 Features
6 0.90796 0.952869 0.652726 0.977806 Elastic_Net with All Features
7 0.940304 0.969693 0.663218 0.977015 Lasso with All Features
8 0.948047 0.973677 0.664517 0.976826 Lasso with Top 20 Features
9 0.948242 0.973777 0.666597 0.976821 Elastic_Net with Top 20 Features

After retraining the models with different alpha and input features, Ridge regression model with alpha 0.001 and all features performed best among others.

  • Mean Squared Error (MSE) 0.729511:

MSE measures the average squared difference between predicted and actual values. In this case, the MSE of 0.729511 is relatively low, indicating that, on average, the squared errors between predicted and actual values are small. Lower MSE values suggest better accuracy.

  • Root Mean Squared Error (RMSE) 0.854114:

RMSE is the square root of the MSE and provides a measure of the average magnitude of the errors. A lower RMSE (0.854114) signifies that, on average, the model's predictions are close to the actual values. It is in the same unit as the target variable.

  • Mean Absolute Error (MAE) 0.585601:

MAE measures the average absolute difference between predicted and actual values. With an MAE of 0.585601, the model's predictions, on average, deviate by approximately 0.60886 units from the actual values. Lower MAE values indicate better accuracy.

  • R-squared (R2) 0.982168:

R2 represents the proportion of variance in the target variable that is predictable from the independent variables. An R2 value of 0.982168 is exceptionally high, indicating that the model explains about 98.21% of the variance in the closing stock prices. A higher R2 value suggests a better accuracy.

In summary, the provided accuracy scores collectively suggest that the model performs exceptionally well. The low MSE, RMSE, MAE and high R2 score indicate that the model's predictions are close to the actual values.

In [74]:
prediction_df
Out[74]:
date y_test lr_pred_base lr_pred20 lr_pred15 lr_pred10 ridge_pred_base ridge_pred20 lasso_pred_base lasso_pred20 elastic_pred_base elastic_pred20
1729 2020-01-02 54.240002 54.157799 54.239022 54.554907 54.590333 54.205840 54.349344 54.496024 54.501734 54.444972 54.503795
1730 2020-01-03 54.150002 54.553547 54.520826 54.558027 54.511431 54.542331 54.381596 54.190970 54.262946 54.198628 54.263269
1731 2020-01-06 53.919998 54.336899 54.065422 54.148986 54.126188 54.345656 54.233187 53.975755 54.001497 54.011290 54.055709
1732 2020-01-07 54.049999 53.907121 54.067429 53.903359 53.992480 53.873857 54.036503 53.828707 53.859163 53.818335 53.920259
1733 2020-01-08 54.189999 54.192608 53.987340 53.942897 54.081508 54.195624 54.059649 54.002350 53.989285 54.037049 54.058664
... ... ... ... ... ... ... ... ... ... ... ... ...
2694 2023-11-01 67.970001 67.251327 66.691627 66.293326 66.511564 67.298025 66.970336 66.962254 67.016646 67.047164 67.123766
2695 2023-11-02 68.820000 68.295223 67.635666 67.398789 67.681439 68.052648 67.865085 67.502858 67.485406 67.584452 67.672327
2696 2023-11-03 68.239998 68.864264 68.759090 68.689217 68.943305 68.811739 68.806564 68.155329 68.270610 68.171208 68.378378
2697 2023-11-06 68.489998 68.041446 68.134383 68.593341 68.842344 68.125227 68.286008 68.186065 68.384744 68.222377 68.472590
2698 2023-11-07 69.019997 68.239220 68.843554 69.136339 69.178896 68.336189 68.670691 68.368078 68.539077 68.463381 68.531877

970 rows × 12 columns

In [75]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred_base,label='lr_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred20,label='lr_pred20')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred15,label='lr_pred15')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred10,label='lr_pred10')

sns.lineplot(x = prediction_df.date, y=prediction_df.ridge_pred_base,label='ridge_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.ridge_pred20,label='ridge_pred20')

sns.lineplot(x = prediction_df.date, y=prediction_df.lasso_pred_base,label='lasso_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.lasso_pred20,label='lasso_pred20')

sns.lineplot(x = prediction_df.date, y=prediction_df.elastic_pred_base,label='elastic_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.elastic_pred20,label='elastic_pred20')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()
In [76]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred_base,label='lr_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred20,label='lr_pred20')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred15,label='lr_pred15')
sns.lineplot(x = prediction_df.date, y=prediction_df.lr_pred10,label='lr_pred10')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()
In [77]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.ridge_pred_base,label='ridge_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.ridge_pred20,label='ridge_pred20')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()
In [78]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.lasso_pred_base,label='lasso_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.lasso_pred20,label='lasso_pred20')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()
In [79]:
plt.figure(figsize=(20, 10))
sns.lineplot(x = prediction_df.date, y=prediction_df.y_test,label='y_test')

sns.lineplot(x = prediction_df.date, y=prediction_df.elastic_pred_base,label='elastic_pred_base')
sns.lineplot(x = prediction_df.date, y=prediction_df.elastic_pred20,label='elastic_pred20')

plt.legend(prop={'size': 14, 'weight': 'bold'})
plt.title('Model Prediction Comparison', fontsize=16)
plt.ylabel('Prediction', fontsize=14)
plt.xlabel('Date', fontsize=14)

plt.show()

7.a Final Best Model¶

In [80]:
# target column is next day's close price
y_train = train_df['close_1d_next'].copy()
X_train = train_df.drop(['close_1d_next'], 1)

# target column is next day's close price
y_test  = test_df['close_1d_next'].copy()
X_test  = test_df.drop(['close_1d_next'], 1)
In [81]:
ridge_20_features = ridge_base_feature_importance[:20]['Feature'].tolist()
X_train = X_train[ridge_20_features]
X_test = X_test[ridge_20_features]
In [82]:
def train_ridge_regression(X_train,X_test,y_train,y_test):
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    ridge_model = Ridge(alpha=0.001)
    ridge_model.fit(X_train_scaled, y_train)


    # Make predictions on the scaled test set
    ridge_pred = ridge_model.predict(X_test_scaled)
    ridge_score = evaluate_regression_model2(y_test, ridge_pred)

    return ridge_model,ridge_pred,ridge_score
In [83]:
ridge_model, ridge_pred, ridge_score = train_ridge_regression(X_train,X_test,y_train,y_test)
In [84]:
ridge_score
Out[84]:
{'MSE': 0.7295114881722916,
 'RMSE': 0.8541144467647714,
 'MAE': 0.5856010765186319,
 'R2': 0.9821678541358965}
In [85]:
ridge_pred[:15]
Out[85]:
array([54.34934416, 54.38159609, 54.23318706, 54.03650291, 54.05964943,
       54.17549586, 54.25017275, 54.54902816, 54.18781481, 54.74202716,
       55.14666847, 55.18770245, 55.53205975, 55.4774224 , 55.51006433])
In [86]:
plot_regression_accuracy(y_test, ridge_pred)
In [87]:
plot_predictions(df,ridge_pred)
In [88]:
plot_feature_importance(ridge_model,X_train,20)
Out[88]:
Feature Importance
0 sma_5 25.441510
1 close_5d_avg 22.435160
2 adj close_5d_avg 11.490239
3 adj close_1d_ago 5.671591
4 adj close_10d_avg 5.507894
5 close_5d_ago 5.147037
6 close_1d_ago 4.554521
7 close_10d_avg 4.509604
8 close_15d_avg 3.719596
9 low_5d_avg 3.022988
10 sma_15 2.915010
11 ema_9 2.899160
12 low_10d_avg 1.985008
13 high_5d_avg 1.745675
14 open_10d_avg 1.590243
15 open_5d_avg 1.561036
16 open_15d_avg 1.264441
17 close_3d_ago 0.798379
18 close_7d_avg 0.640924
19 high_30d_avg 0.200250

The residual, scatter, and time series line charts above clearly show that the predicted values are very close to the actual values. These visualizations confirm that the model is very good at making accurate predictions, highlighting its strong performance and reliability in understanding the details of the data.

8. Train Multiple Different Stocks with Ridge Regression Model¶

In [89]:
df_all = pd.read_parquet(out_loc+"stock_1d.parquet")
df_all.columns = df_all.columns.str.lower()
In [90]:
### keep stocks in data with min year 2013, max year 2023
stock_min_dt = pd.DataFrame(df_all.groupby('symbol')['date'].min()).reset_index().rename(columns={'date':'min_date'})
stock_max_dt = pd.DataFrame(df_all.groupby('symbol')['date'].max()).reset_index().rename(columns={'date':'max_date'})
stock_cnt_dt = pd.DataFrame(df_all.groupby('symbol')['date'].count()).reset_index().rename(columns={'date':'days_cnt'})

stock_cnt = stock_min_dt.merge(stock_max_dt,on='symbol').merge(stock_cnt_dt,on='symbol')
stock_cnt['min_year'] = stock_cnt['min_date'].dt.year
stock_cnt['max_year'] = stock_cnt['max_date'].dt.year

keep_stocks = stock_cnt[(stock_cnt['min_year']==2013)&(stock_cnt['max_year']==2023)&(stock_cnt['days_cnt']>=2500)]['symbol'].unique().tolist()

stock_cnt.head()
Out[90]:
symbol min_date max_date days_cnt min_year max_year
0 A 2013-01-02 2023-11-08 2733 2013 2023
1 AAL 2013-01-02 2023-11-08 2733 2013 2023
2 AAPL 2013-01-02 2023-11-08 2733 2013 2023
3 ABBV 2013-01-02 2023-11-08 2733 2013 2023
4 ABNB 2020-12-10 2023-11-08 733 2020 2023
In [91]:
df_2023 = df_all[(df_all.date.dt.year==2023) & (df_all.symbol.isin(keep_stocks))]
# volume vs stocks
volume_2023 = pd.DataFrame(df_2023.groupby(['symbol','security','gics sector'])['volume'].sum()).reset_index()
volume_2023 = volume_2023.sort_values(by='volume',ascending=False).reset_index(drop=True)
volume_2023.head()
Out[91]:
symbol security gics sector volume
0 TSLA Tesla, Inc. Consumer Discretionary 3.009291e+10
1 AMD AMD Information Technology 1.342035e+10
2 AMZN Amazon Consumer Discretionary 1.305160e+10
3 AAPL Apple Inc. Information Technology 1.303964e+10
4 F Ford Motor Company Consumer Discretionary 1.278319e+10
In [92]:
# volume vs sectors
sector_2023 = pd.DataFrame(df_2023.groupby(['gics sector'])['volume'].sum()).reset_index()
sector_2023 = sector_2023.sort_values(by='volume',ascending=False).reset_index(drop=True)
sector_2023
Out[92]:
gics sector volume
0 Consumer Discretionary 9.171407e+10
1 Information Technology 8.888840e+10
2 Financials 6.728113e+10
3 Communication Services 5.267892e+10
4 Health Care 3.755560e+10
5 Industrials 3.672492e+10
6 Energy 3.245171e+10
7 Consumer Staples 2.824873e+10
8 Utilities 2.214882e+10
9 Materials 1.432867e+10
10 Real Estate 1.318748e+10
In [93]:
# filter top 5 sectors with highest volume in 2023
sector_list = sector_2023[:5]['gics sector'].tolist()

stock_list = []

num_stocks = 5
# stocks with highest volume in each sector
for sec in sector_list:
    stock_list.append(volume_2023[volume_2023['gics sector']==sec]['symbol'][:num_stocks].tolist())
    
    
stock_list = [item for sublist in stock_list for item in sublist]

len(stock_list)
Out[93]:
25
In [94]:
df_stocks = df_all[df_all['symbol'].isin(stock_list)].reset_index(drop=True)
df_stocks.head()
Out[94]:
date open high low close adj close volume symbol security gics sector gics sub-industry headquarters location date added cik founded
0 2013-01-02 18.003504 18.193193 17.931683 18.099348 18.099348 101550348.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
1 2013-01-03 18.141392 18.316566 18.036036 18.109859 18.109859 92635272.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
2 2013-01-04 18.251753 18.555305 18.210211 18.467718 18.467718 110429460.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
3 2013-01-07 18.404655 18.503002 18.282784 18.387136 18.387136 66161772.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
4 2013-01-08 18.406906 18.425926 18.128880 18.350851 18.350851 66976956.0 GOOGL Alphabet Inc. (Class A) Communication Services Interactive Media & Services Mountain View, California 2014-04-03 1652044 1998
In [95]:
def preprocess_data(df):
    add_moving_averages(df, 'close')
    df['rsi'] = rsi(df)
    df['mfi'] = mfi(df, 14)
    df['macd'] = df['close'].ewm(span=12, min_periods=12).mean() - df['close'].ewm(span=26, min_periods=26).mean()
    df['macd_signal'] = df['macd'].ewm(span=9, min_periods=9).mean()
    df['close_1d_next'] = df['close'].shift(-1)

    # Define lag periods and rolling window sizes
    lag_periods = [1, 3, 5, 7, 14, 21, 28]
    rolling_windows = [3, 5, 7, 10, 15, 30]

    # Columns to create features for
    columns = ['close', 'adj close', 'open', 'high', 'low', 'volume']

    # Add lagged and rolling average features for each column
    for column in columns:
        add_lagged_features(df, column, lag_periods)
        add_rolling_avg_features(df, column, rolling_windows)

    #df = df.dropna().reset_index(drop=True)

    return df
In [96]:
stock_compare = []

for stock in stock_list:
        stock_data = df_stocks[df_stocks['symbol'] == stock]
        stock_data = preprocess_data(stock_data)
        stock_data = stock_data.dropna().reset_index(drop=True)
        
                # Split the DataFrame into training and testing sets
        train_df_temp = stock_data[stock_data.date.dt.year<2020]
        test_df_temp = stock_data[stock_data.date.dt.year>=2020]

        drop_cols1 = ['date','open','high','low','close','adj close','volume','symbol','security',
         'gics sector','gics sub-industry','headquarters location','date added','cik','founded']

        train_df_temp = train_df_temp.drop(drop_cols1, 1)

        test_df_temp  = test_df_temp.drop(drop_cols1, 1)

        # target column is next day's close price
        y_train_temp = train_df_temp['close_1d_next'].copy()
        X_train_temp = train_df_temp.drop(['close_1d_next'], 1)

        # target column is next day's close price
        y_test_temp  = test_df_temp['close_1d_next'].copy()
        X_test_temp  = test_df_temp.drop(['close_1d_next'], 1)
#        print(stock, len(X_train), len(X_test), len(y_train), len(y_test))

        X_train_temp = X_train_temp[ridge_20_features]
        X_test_temp = X_test_temp[ridge_20_features]

        temp_model, temp_pred, temp_score = train_ridge_regression(X_train_temp,X_test_temp,y_train_temp,y_test_temp)
        
        score_df = pd.DataFrame([temp_score.keys(),temp_score.values()])
        score_df.columns = score_df.iloc[0]
        score_df = score_df[1:].reset_index(drop=True)
        score_df['symbol'] = stock
        
        stock_compare.append(score_df)
        
compare_df = pd.concat(stock_compare).sort_values(by='R2',ascending=False).reset_index(drop =True)
In [97]:
compare_df
Out[97]:
MSE RMSE MAE R2 symbol
0 53.880829 7.340356 4.993183 0.995324 NVDA
1 0.492338 0.701668 0.488305 0.994058 VZ
2 7.288854 2.699788 2.009309 0.993042 AAPL
3 4.806856 2.192454 1.600387 0.992472 GOOG
4 4.773841 2.184912 1.587615 0.992342 GOOGL
5 44.497002 6.670607 4.551507 0.990978 META
6 1.977385 1.406195 0.998546 0.99076 CVS
7 24.89376 4.989365 3.774 0.990518 MSFT
8 1.212324 1.101056 0.819318 0.990392 GM
9 75.230542 8.673554 6.092839 0.990036 TSLA
10 0.153693 0.392037 0.279125 0.989988 F
11 0.5373 0.733008 0.545867 0.989299 BAC
12 0.568511 0.753996 0.543832 0.988847 PFE
13 0.245541 0.495521 0.359251 0.988802 KEY
14 1.632282 1.277608 0.868622 0.988481 INTC
15 0.802889 0.896041 0.613263 0.987932 CCL
16 0.940589 0.96984 0.7127 0.987702 WFC
17 10.046478 3.169618 2.318726 0.987042 AMZN
18 0.151731 0.389527 0.254337 0.985373 T
19 2.091562 1.446223 1.031672 0.984222 C
20 9.521865 3.085752 2.267984 0.98374 AMD
21 0.111184 0.333443 0.243588 0.981839 HBAN
22 0.791798 0.889831 0.641554 0.98043 BMY
23 4.116423 2.028897 1.434524 0.969971 JNJ
24 1.319744 1.148801 0.902782 0.866884 VTRS

The final phase of the project involved applying the developed model to real-world scenarios. By identifying the top 5 industries with the highest volume in 2023, we ensured that our predictions were grounded in current market dynamics. The subsequent selection of 5 stocks within each industry added a layer of practicality to our findings.

The model's stellar performance on NVDA, AAPL, VZ, GOOG, and GOOGL proved its robustness in diverse market conditions. Simultaneously, the challenges encountered with VTRS opened up opportunities for further investigation into the factors contributing to its underperformance.

In [ ]: